Example #1
def compare(origin_article_obj, tgt_article_objs):
	tgt_paragraph_docs = []
	tgt_grafs = []

	for obj in tgt_article_objs:
		if obj['paragraphs'] is not None:
			for graf in obj['paragraphs']:
				tgt_grafs.append({
					'text': graf,
					'url': obj['url'],
					'img': obj['img_src'],
					'title': obj['title']
				})
				tgt_paragraph_docs.append(Document(graf, description=obj['url']))

	origin_graf_doc = Document(' '.join(origin_article_obj['paragraphs']), description='origin')

	m = Model(documents=tgt_paragraph_docs+[origin_graf_doc], weight=TFIDF)

	tgts_by_dist = sorted(range(len(tgt_paragraph_docs)), key=lambda i: m.similarity(origin_graf_doc, tgt_paragraph_docs[i]))

	furthest = map(lambda i: tgt_grafs[i], tgts_by_dist)

	furthest_unique = []
	for entry in furthest[::-1]:
		if not any(obj['url'] == entry['url'] or obj['text'] == entry['text'] for obj in furthest_unique):
			furthest_unique.append(entry)

	return furthest_unique[:10]
Example #2
def articles_to_trends(articles):
    news = {}
    for story in articles:
        if story['added_at']:
            article_text = get_article_text(story['url'])
            d, s = timestamptext(story['added_at'], article_text)

            # Each key in the news dictionary is a date: news is grouped per day.
            # Each value is a dictionary of id => story items.
            # We use hash(story['summary']) as a unique id to avoid duplicate
            # content.
            news.setdefault(d, {})[hash(s)] = s

    m = Model()
    for date, stories in news.items():
        s = stories.values()
        s = ' '.join(s).lower()
        # Each day of news is a single document.
        # By adding all documents to a model we can calculate tf-idf.
        m.append(Document(s, stemmer=LEMMA, exclude=[
                 'news', 'day'], name=date))

    for document in m:
        print document.name
        print document.keywords(top=10)
Example #3
 def calculate(self,minePackage,progress):
     webDocuments=[]
     query=Document((minePackage['searchKey']))
     clouds=minePackage['clouds']
     count=UnPack()
     totalLinks=count.total(clouds)
     progress.set_totalIR(totalLinks)  # total number of documents to retrieve
     progress.set_IRState('Ejecutando')  # update the process status
     urlContent=UrlToPlainText()
     step=0
     for cloud in clouds:
         if not progress.get_stop():
             for n in cloud.graph.nodes():
                 if not progress.get_stop():
                     doc=cloud.graph.node[n]['methodData']
                     webDocuments.append(Document(doc.getData()))
                     step+=1
                     progress.set_IRProgress(step)  # step-by-step progress of the process
                 else:
                     break
         else:
             break
     if not progress.get_stop():
         m=Model(documents=webDocuments, weight=TFIDF)
         for cloud in clouds:
             for n in cloud.graph.nodes():
                 methodData=cloud.graph.node[n]['methodData']
                 vector=Document(methodData.getData())
                 cloud.graph.node[n]['weight_VSM']=m.similarity(vector,query)
Example #4
def crearIndiceInvertidoCuerpoMensajes():
    cuerpo1='''
    Estimados socios:

    ya hemos firmado el contrato de compraventa con el cliente preferencial.
    Espero noticias vuestras.

    Un saludo,
    '''
    cuerpo2='''
    Estimados Antonio:

    agradezco mucho tus buenas noticias, aunque me temo que el documento que debe adjuntarse al contrato se va a retrasar
    unos dias.
    
    Un saludo,
    '''
    
    cuerpo3='''
    Estimados socios:

    aunque el contrato no este legalizado aun, me he permitido hacer una transferencia por 
    la mitad del importe al contratista.
    
    Un saludo,
    '''
    cuerpo4='''
    Estimados socios:

    muchas gracias por las gestiones. se lo comunicare al cliente hoy mismo.
    Un saludo,
    '''
    
    cuerpo5='''
    Estimado Luis:

    ya hemos realizado una transferencia a su cuenta por el importe establecido inicialmente.
    
    Un saludo,
    '''
    
    cuerpo6='''
    Un saludo,
    '''
    
    correo1 = Document(cuerpo1, name="correo1",threshold=0,stopwords = True,language = 'es')
    correo2 = Document(cuerpo2, name="correo2",threshold=0,stopwords = True,language = 'es')
    correo3 = Document(cuerpo3, name="correo3",threshold=0,stopwords = True,language = 'es')
    correo4 = Document(cuerpo4, name="correo4",threshold=0,stopwords = True,language = 'es')
    correo5 = Document(cuerpo5, name="correo5",threshold=0,stopwords = True,language = 'es')
    correo6 = Document(cuerpo6, name="correo6",threshold=0,stopwords = True,language = 'es')
    
    modeloCorreos = Model(documents=[correo1,correo2,correo3,correo4,correo5,correo6], weight=TFIDF) 
    correos = modeloCorreos.documents
    informacion = ""
    for correo in correos:
        palabras = correo.features 
        for palabra in palabras:
            informacion +="Palabra: "+str(palabra) +" => índice invertido: "+str(modeloCorreos.idf(palabra))+", correo: "+correo.name+"\n"
    print informacion
Example #5
def word_ranking(text, n='L2'):
    """
    extract most relevant sentences from text according to LSA algorithm
    steps:    
    1. tokenize text by sentences
    2. compute tfidf matrix
    3. applying SVD of tfidf matrix (reduce to n-dimensions) 
    4. ranking sentences according to cross-method (source: http://www.aclweb.org/anthology/C10-1098.pdf)
        
    - text: string consisting of a few sentences
    - n: number of sentences to extract
    
    """
    # tokenize text to sentences list
    sentences = tokenize(text)

    #==============================================================================
    #     # syntactic filter
    #     exclude_list = []
    #     for sent in sentences:
    #         for word, pos in tag(sent):
    #             if pos != "JJ" or pos != 'NN': # Retrieve all adjectives and nouns.
    #                 exclude_list.append(word.lower())
    #==============================================================================

    # create documents list
    # stop words and punctuation are removed by default
    docs = [Document(sentences[i], name=i) for i in range(len(sentences))]

    # model initialize
    m = Model(docs, weight=TFIDF)

    # number of dimensions equal to the Euclidean norm of the singular values
    # U, S, Vt = np.linalg.svd(m.vectors, full_matrices=False)
    # dimensions=int(round(np.linalg.norm(S, 2)))
    m.reduce(dimensions=n)

    # sentences selection according to cross-method
    # source: http://www.ceng.metu.edu.tr/~e1395383/papers/TextSummarizationUsingLSA(Journal).pdf
    # topic(rows) x tokens(cols) matrix(tfidf)
    V = np.array(m.lsa.vt)

    # average score for each concept/topic (mean over each row of the Vt matrix)
    avg_score = np.mean(V, axis=1).reshape((-1, 1))

    # cell values which are less than or equal to the average score are set to zero
    V[V <= avg_score] = 0.0

    # sigma matrix obtained from the SVD
    S = np.array(m.lsa.sigma).reshape((-1, 1))

    # aggregate score of each term across concepts, weighted by the singular values
    length = np.sum(V * S, axis=0)

    # ranking words by length score
    ranking = Counter(dict(zip(m.lsa.terms, length)))  #.most_common(n)

    #words, score =  list(zip(*ranking))

    return ranking
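A minimal usage sketch for word_ranking(); the sample_text string is illustrative, and the call assumes the tokenize helper and the pattern.vector imports used by the function above are available. Since the return value is a collections.Counter keyed by LSA terms, the top-scoring words can be read off with most_common():

# Hypothetical usage of word_ranking(); sample_text is a placeholder.
sample_text = ("Cats are small domesticated carnivores. "
               "Dogs are loyal companions that need daily exercise. "
               "Both cats and dogs are popular household pets.")

ranking = word_ranking(sample_text)          # Counter: term -> cross-method score
for word, score in ranking.most_common(5):   # five highest-scoring terms
    print("{:<15} {:.4f}".format(word, score))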
Example #6
def feeds_to_trends(feeds):
    for url in feeds:
        url = url['feed_url']
        news = {}
        try:
            for story in Newsfeed().search(url, cached=False):
                d, s = datetext(story.date, story.description)

                # Each key in the news dictionary is a date: news is grouped per day.
                # Each value is a dictionary of id => story items.
                # We use hash(story.description) as a unique id to avoid duplicate
                # content.
                news.setdefault(d, {})[hash(s)] = s

            m = Model()
            for date, stories in news.items():
                s = stories.values()
                s = ' '.join(s).lower()
                # Each day of news is a single document.
                # By adding all documents to a model we can calculate tf-idf.
                m.append(Document(s, stemmer=LEMMA, exclude=[
                         'news', 'day'], name=date))

            for document in m:
                print document.name
                print document.keywords(top=10)
        except HTTP404NotFound:
            print url
            pass
Example #7
def getMod():
    essay_path = 'essays/original/'
    files = fio.recGetTextFiles(path.abspath(essay_path))
    docs = []
    for f in files:
        with io.open(f, 'r', encoding='utf-8') as w:
            text = TextBlob(PageParser.parse(w.read()))
            text = ' '.join([
                word for word in text.words if word not in cachedStopWords
            ]).lstrip()
            #ent_text = ' '.join(er.recognize_entities(text.sentences))
            #ent_text = PageParser.parse(w.read())
            docs.append(Document(text, name=f, top=40))
    m = Model(docs)
    lsa = m.reduce(5)
    return lsa
    # Clustering could be a useful technique, commenting out for now
    #with io.open(r'lsa.txt', 'w+', encoding='utf-8') as w:
    #	write_cluster(m.cluster(method=HIERARCHICAL, k=4), w, "")

    with io.open(r'lsa.txt', 'w+', encoding='utf-8') as w:
        for i, concept in enumerate(m.lsa.concepts):
            print("Concept {0}:".format(i)),
            w.write(unicode("Concept {0}:".format(i)))
            count = 0
            # Show top only first 5 features we come across
        for feature, weight in m.lsa.concepts[i].items():
            if abs(weight) > 0.2:
                print(feature),
                w.write(feature + " ")
                count += 1

            if count > 5:
                break
        w.write(unicode('\n'))
        #print

        cat_docs = []
        for d in m.documents:
            cat = (0, 0, {})
            #print d.name.split('\\')[-1]
            for idx, weight in m.lsa.vectors[d.id].items():
                print "\tCat {0}: {1}".format(idx, weight)
                if abs(weight) > abs(cat[1]) or cat[1] == 0:
                    cat = (idx, weight, d)

            if cat[0] == i:
                cat_docs.append(cat)
                #print "\t{0}".format(d.name.split('\\')[-1])

        cat_docs.sort(key=lambda tup: abs(tup[1]), reverse=True)
        for cat, weight, d in cat_docs:
            f = d.name.split('\\')[-1]
            w.write(
                unicode("\t{0} - {1}\n").format(
                    filter(lambda x: x in string.printable, f), weight))
Example #8
File: ftes.py Project: kqdtran/FTES
def bag_of_words_tfidf(lst):
    '''
    Constructs a bag of words model, where each document is a Facebook post/comment
    Also applies TFIDF weighting, lemmatization, and filter out stopwords
    '''
    model = Model(documents=[], weight=TFIDF)
    for msg, link in lst:
        doc = Document(msg, stemmer=LEMMA, stopwords=True, name=msg, description=link)
        model.append(doc)
    return model
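A hedged sketch of how the returned model might be queried; the (message, link) pairs and the query text are made up, and it relies only on pattern.vector calls that already appear on this page (Document, Model.neighbors).

# Hypothetical usage of bag_of_words_tfidf(); the post data is illustrative.
posts = [
    ("Join us for the hackathon this Friday night", "https://example.com/post/1"),
    ("Photos from last week's hackathon are now online", "https://example.com/post/2"),
    ("Reminder: club elections are next Tuesday", "https://example.com/post/3"),
]
model = bag_of_words_tfidf(posts)

# Posts closest to a free-text query, by cosine similarity on the tf-idf vectors.
query = Document("when is the hackathon?", stemmer=LEMMA, stopwords=True)
for similarity, doc in model.neighbors(query, top=2):
    print("{:.3f}  {}  {}".format(similarity, doc.name, doc.description))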
Example #9
File: lsa.py Project: wframe/421_Final
def getMod():
    essay_path = 'essays/original/'
    files = fio.recGetTextFiles(path.abspath(essay_path))
    docs = []
    for f in files:
        with io.open(f, 'r', encoding='utf-8') as w:
            text = TextBlob(PageParser.parse(w.read()))
            text = ' '.join([word for word in text.words if word not in cachedStopWords]).lstrip()
            #ent_text = ' '.join(er.recognize_entities(text.sentences))
            #ent_text = PageParser.parse(w.read())
            docs.append(Document(text, name=f, top=40))
    m = Model(docs)
    lsa = m.reduce(5)
    return lsa
    # Clustering could be a useful technique, commenting out for now
    #with io.open(r'lsa.txt', 'w+', encoding='utf-8') as w:
    #	write_cluster(m.cluster(method=HIERARCHICAL, k=4), w, "")

    with io.open(r'lsa.txt', 'w+', encoding='utf-8') as w:
        for i, concept in enumerate(m.lsa.concepts):
            print("Concept {0}:".format(i)),
            w.write(unicode("Concept {0}:".format(i)))
            count = 0
            # Show top only first 5 features we come across
        for feature, weight in m.lsa.concepts[i].items():
            if abs(weight) > 0.2:
                print(feature),
                w.write(feature + " ")
                count += 1

            if count > 5:
                break
        w.write(unicode('\n'))
        #print

        cat_docs = []
        for d in m.documents:
            cat = (0, 0, {})
            #print d.name.split('\\')[-1]
            for idx, weight in m.lsa.vectors[d.id].items():
                print "\tCat {0}: {1}".format(idx, weight)
                if abs(weight) > abs(cat[1]) or cat[1] == 0:
                    cat = (idx, weight, d)

            if cat[0] == i:
                cat_docs.append(cat)
                #print "\t{0}".format(d.name.split('\\')[-1])

        cat_docs.sort(key=lambda tup: abs(tup[1]), reverse=True)
        for cat, weight, d in cat_docs:
            f = d.name.split('\\')[-1]
            w.write(unicode("\t{0} - {1}\n").format(filter(lambda x: x in string.printable, f), weight))
Example #10
def runTFIDF():
  """Given a list of classes, construct a Vector-Space Model.
  We only need to do it once and save it to a pickle file for
  fast loading later on"""

  model = Model(documents=[], weight=TFIDF)
  for r, d, files in os.walk("project/data/"):
    for f in files:
      if f.endswith(".txt"):
        text = readFile(f)
        doc = Document(text, stemmer=LEMMA, stopwords=True, name=f.replace(".txt", ""))
        model.append(doc)
  model.save("project/pickle/course.pic")
Example #11
def extract():
    print 'Extracting features from app descriptions...\n'
    if os.path.exists(OUTPUT_PATH):
        shutil.rmtree(OUTPUT_PATH)
    os.makedirs(OUTPUT_PATH)

    for dir in os.listdir(INPUT_PATH):
        if not dir.startswith('.'):
            os.makedirs("{}/{}".format(OUTPUT_PATH, dir))
            for file in os.listdir('{}/'.format(INPUT_PATH) + dir):
                with open('{}/{}/{}'.format(INPUT_PATH, dir, file), 'rb') as f:
                    reader = csv.reader(f)
                    next(reader)
                    with open('{}/{}/{}'.format(OUTPUT_PATH, dir, file),
                              'wb') as r:
                        writer = csv.writer(r)
                        for app in reader:
                            name = app[0]
                            description = app[2]

                            # Prepare an app description string for NLTK and LDA processing
                            preparedDescription = prepare_description(
                                description)

                            # Extract 3 word featurlets from the description
                            featurelets = featurelet_extraction(
                                preparedDescription)

                            featurelet_docs = []
                            for feature in featurelets:
                                featurelet = '{} {} {}'.format(
                                    feature[0], feature[1], feature[2])
                                featurelet_docs.append(
                                    Document(featurelet, name=featurelet))

                            # Perform hierarchical clustering
                            m = Model(featurelet_docs)
                            cluster = m.cluster(method=HIERARCHICAL,
                                                k=3,
                                                iterations=1000,
                                                distance=COSINE)

                            # Organize clusters into features and alternative tokens
                            (features,
                             alterTokens) = group(cluster, [], [], [])

                            # Write results to file
                            writer.writerow(
                                [name, description, features, alterTokens])
                        r.close()
                    f.close()
Example #12
File: nlp.py Project: krukmat/smartnews
def compute_topics(set_reduce_topics, today):
    # Based on similarity
    # Based on words
    cleanup_topic(today.day, today.month, today.year)
    ScrapedTopicGroups.sync()
    sites = SiteNewsScrapedData.objects.all()
    documents = []
    for site in sites:
        for sentence in site.content.split('.'):
            if sentence:
                tree = parsetree(sentence, lemmata=True)
                if len(tree) > 0:
                    documents.append(tree[0])

    documents = [[w.lemma for w in document if
                  w.tag.startswith((u'NN', u'NNS', u'NNP', u'NNPS')) and w.lemma not in settings.STOP_WORDS] for
                 document in documents]

    documents = [Document(" ".join(document) + '.') for document in documents if len(document) > 1]
    model = Model_Comp(documents=documents)

    # format: (distribution, Document)
    documents_analyzed = []
    for document in documents:
        tokens = []
        similar_items_news = model.nearest_neighbors(document)
        for similarity, sim_document in similar_items_news:
            if similarity > 0.95 and sim_document.id not in documents_analyzed:
                tokens.extend([word for word, _ in sim_document.words.iteritems()])
                documents_analyzed.append(sim_document.id)
        # also include this document's own words if no similar document covered it
        if document.id not in documents_analyzed:
            tokens.extend([word for word, _ in document.words.iteritems()])
            documents_analyzed.append(document.id)
        # filter the most relevant words (based on count)
        counter = defaultdict(int)
        for token in tokens:
            counter[token] += 1
        # Order counter desc
        tokens_org = sorted(counter.items(), key=lambda element: element[1], reverse=True)
        tokens = [token for token, count in tokens_org[:3]]
        if tokens and len(tokens) > 0:
            links = SiteNewsScrapedData.find_coincidences(tokens)
            # only keep topics with more than 3 links
            if len(links) > 3:
                ScrapedTopicGroups.create(tags=tokens, links=links, relevance=len(links),
                                        day=today.day, month=today.month, year=today.year)
    if set_reduce_topics:
        reduce_topics(today.day, today.month, today.year)
    return True
Example #13
def summarize(text, n=1):
    """
    extract most relevant sentences from text according to TextRank algorithm
    - text: string consisting of a few sentences
    - n: number of sentences to extract
    """
    # tokenize text to sentences list
    sentences = tokenize(text)

    # create documents list
    # stop words and punctuation are removed by default
    docs = [Document(sentences[i], name=i) for i in range(len(sentences))]

    # model initialize
    m = Model(docs, weight=TFIDF)

    # dict of TextRank ranking of cosine similarity matrix
    ranking = utils.textrank(m.documents, m.distance)

    # indexes of top n sentences
    top_sents_idx, _ = list(zip(*ranking.most_common(n)))

    # reordering
    output = [sentences[i] for i in sorted(top_sents_idx)]

    return ''.join(output)
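A small usage sketch for summarize(); the article string is illustrative, and the call assumes the tokenize and utils.textrank helpers the function depends on are importable.

# Hypothetical call; article is a made-up three-sentence text.
article = ("The council approved the new budget on Monday. "
           "Most of the funds will go to road repairs and public transit. "
           "Opponents argued that schools should have received more money.")

print(summarize(article, n=1))  # prints the single highest-ranked sentence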
Example #14
def create_model(doc_list):
    '''
    Given a list of documents in Pattern.Vector Document format, create a
    Pattern.Vector Model.
    '''
    print "Creating a TFIDF model for {} documents".format(len(doc_list))
    return Model(documents=doc_list, weight=TFIDF)
Example #15
def build_model(results=[]):
    documents = [
        Document(i.get('text'),
                 name=i.get('url'),
                 description=i.get('index'),
                 stemmer=LEMMA) for i in results
    ]
    m = Model(documents, weight=TFIDF)

    y, x = 1, len(m.features)
    model = np.zeros((y, x))

    sentence_dict = {}
    model_sentences = []
    for i_index, i in enumerate(documents):
        sentences = sent_tokenize(results[i_index].get('text').lower())

        dy, dx = len(sentences), x
        for s_index, s in enumerate(sentences):
            s_words = {
                w: 1
                for w in words(s, stemmer=LEMMA, stopwords=False)
                if not stopwords_hash.get(w)
            }
            if len(s_words) < 5:
                continue
            model_sentences.append(s)
            model = np.append(
                model, [[1 if s_words.get(w) else 0 for w in m.features]], 0)
            sentence_dict[model.shape[0] - 1] = i.name
            # model_sentences[model.shape[0]-1] = s

    model = np.delete(model, (0), 0)

    return model, m, model_sentences, sentence_dict
Example #16
	def modeling(self, descriptions, field = False, limit = False):
		"""Model returns a pattern.vector.Model object which is a list of pattern.vector.Document using Ehri.Get() descriptions
		Keyword arguments:
		descriptions ---	EHRI.get() description object
		field 	---		Field to look into, override defaut self.field
		limit	---		Debug option. Limit the model to $limit items
		"""
		if field:
			self.field = field
		if limit:
			self.limit = limit
		D = []
		
		#Creating Pattern Document element from data we got from Neo4J
		#
		
		#For debug reasons, we could set a limit
		if self.limit:
			i = 0
		for description in descriptions:
			D.append(Document(description[self.field], name=description[self.identifier]))
			#And stop the iteration when i reaches the limit
			if self.limit:
				i += 1
				if i == self.limit:
					break
		#Then, creating a model from our array
		self.model = Model(D)
		
		return self.model 
Example #17
def runTFIDFOnSchedule(term=util.currentTerm, year=util.currentYear):
    """
    Given a list of classes, construct a Vector-Space model
    and apply the TFIDF algorithm to measure similarity between courses
    """

    model = Model(documents=[], weight=TFIDF)
    print "Loading from pickle file..."
    allCourses = loadAllCoursesInTerm()
    print "Begin constructing the Vector Space model"

    for course in allCourses:
        text = course.title + " " + course.description
        doc = Document(text, stemmer=LEMMA, stopwords=True, name=course.title,\
                                     description=course)
        model.append(doc)
    print "Finish processing!!!"
    with open("pickle/simCourses" + term + year + ".pickle", "w") as f:
        dump(model, f, 0)
    return model
Example #18
 def buscaCorreo2(x):
     documents = []
     documap = {}
     for archivo in os.listdir("Correos"):
         if archivo.endswith(".txt"):
             f = open("Correos/" + archivo, "r")
             f.readline()
             f.readline()
             f.readline()
             f.readline()
             mailbody = f.read()
             f.close()
             docu = Document(mailbody, name=archivo)
             documents.append(docu)
             docukey = int(archivo[0:-4])
             documap[docukey] = docu
     model = Model(documents=documents, weight=TFIDF)
     docu = documap[int(var.get())]
     tupla = model.neighbors(docu, top=1)[0]
     tkMessageBox.showinfo("Tk", "El documento que mas se parece es el " + tupla[1].name[0:-4] + ", con un " + str(tupla[0]) + " de similitud")
Example #19
 def calculate(self, minePackage):
     webDocuments = []
     query = Document((minePackage['searchKey']))
     clouds = minePackage['clouds']
     count = UnPack()
     totalLinks = count.total(clouds)
     urlContent = UrlToPlainText()
     step = 0
     for cloud in clouds:
         for n in cloud.graph.nodes():
             doc = cloud.graph.node[n]['methodData']
             webDocuments.append(Document(doc.getData()))
             step += 1
     m = Model(documents=webDocuments, weight=TFIDF)
     for cloud in clouds:
         for n in cloud.graph.nodes():
             methodData = cloud.graph.node[n]['methodData']
             vector = Document(methodData.getData())
             cloud.graph.node[n]['weight_VSM'] = m.similarity(
                 vector,
                 query)  # sets the VSM value on the cloud node
Example #20
    def rankingSVM(self, listaUrls, consulta, parametros):
        """ metodo para rankear una lista de urls mediante el algoritmo RSVM
            Entrada:
                listaUrls: lista de los urls para rankear
                consulta: consulta de busqueda en cadena de caracteres
                parametros: parametros
            Salida:
                lista de urls rankeados
        """

        self.preprocesamiento.lecturaSVMRanking(listaUrls, consulta)
        """ creacion de atributos para cada enlace"""
        listaUrls = self.setearAtributosRanking(listaUrls, consulta)
        """se obtiene los puntos para realizar el ranking"""
        puntos = self.getAtributosRanking(listaUrls, consulta.name)
        X = np.array(puntos['X'])

        svmNorelevante = joblib.load('Model/SVM/norelevante.pkl')
        svmRelevante = joblib.load('Model/SVM/relevante.pkl')
        svmMuyrelevante = joblib.load('Model/SVM/muyrelevante.pkl')

        prediccionesNoRelevante = svmNorelevante.predict(X)
        prediccionesRelevante = svmRelevante.predict(X)
        prediccionesMuyRelevante = svmMuyrelevante.predict(X)

        listaUrls = self.preprocesamiento.limpiarListaUrls(
            listaUrls, puntos['name'])
        ranking = []

        modeloLista = []
        for url in listaUrls:
            documento = self.mongodb.getDocumento(url)
            if documento:
                documentoPattern = self.preprocesamiento.getDocumentoPattern(
                    documento['_id'])
                modeloLista.append(documentoPattern)

        unModelo = Model(modeloLista)
        """calculo del puntaje de ranking SVM"""
        for indice, doc in enumerate(unModelo):
            url = doc.name
            documento = {}
            documento['url'] = url
            documento['score'] = (
                1 - self.obtenerVectorSpaceModel(doc, consulta)) + (
                    prediccionesNoRelevante[indice] +
                    prediccionesRelevante[indice] * parametros[1] +
                    prediccionesMuyRelevante[indice] * parametros[2])
            ranking.append(documento)

        listaNueva = sorted(ranking, key=lambda k: k['score'], reverse=True)
        return listaNueva
Example #21
	def kmeansCluster(self, documentList, k, iteration, distance, seed, p):
		if distance.lower() == "cosine":
			distance = COSINE
		elif distance.lower() == "euclidean":
			distance = EUCLIDEAN
		elif distance.lower() == "manhattan":
			distance = MANHATTAN
		else:
			return "invalid distance"

		if seed.lower() == "kmpp":
			seed = KMPP
		elif seed.lower() == "random":
			seed = RANDOM
		else:
			return "invalid seed"
		
		if type(k) is not int:
			return "k is not int"

		if type(iteration) is not int:
			return "iteration is not int"

		if type(p) is not float and type(p) is not int:
			return "p is not float"

		if type(documentList) is not list:
			return "document List is not list"

		self.iteration = iteration
		self.seed = seed
		self.p = p
		self.distance = distance
		
		model = Model(documentList)
		cluster = model.cluster(method=KMEANS, k=k, iterations=iteration, distance=distance,seed=seed,p=p)
		return cluster
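A hedged usage sketch for kmeansCluster(); TextCluster is a hypothetical name for the enclosing class (the snippet does not show it), and the four documents are illustrative.

# Hypothetical usage; the enclosing class and its no-argument constructor are assumptions.
from pattern.vector import Document

docs = [Document("the cat sat on the mat", name="d1"),
        Document("dogs chase cats in the yard", name="d2"),
        Document("stock prices fell sharply today", name="d3"),
        Document("markets rallied after the announcement", name="d4")]

clusterer = TextCluster()  # hypothetical instance of the class defining kmeansCluster()
clusters = clusterer.kmeansCluster(docs, k=2, iteration=100,
                                   distance="cosine", seed="kmpp", p=0.8)
print(clusters)  # a list of k clusters, each a list of Document objects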
Example #22
def runTFIDFOnCatalog(term=util.currentTerm, year=util.currentYear):
    """
    Given a dictionary of courses, construct a Vector-Space model
    and apply the TFIDF algorithm to measure similarity between courses.

    We only need to do it once and save it to a pickle file for
    fast loading later on
    """

    model = Model(documents=[], weight=TFIDF)
    print "Loading from pickle file..."
    allCoursesDict = loadCourseCatalog()

    for dept in allCoursesDict:
        print "Processing department", dept
        for course in allCoursesDict[dept]:
            text = course.title + " " + course.description
            doc = Document(text, stemmer=LEMMA, stopwords=True, name=course.title,\
                                         description=course)
            model.append(doc)
        print "Finish processing", dept, "\n"
    with open("pickle/simCatalog" + term + year + ".pickle", "w") as f:
        dump(model, f, 0)
    return model
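As the docstring says, the catalog model is pickled once for fast loading later; a minimal sketch of the reload step, with placeholder term/year values in the pickle path:

# Hypothetical later session; the term and year in the path are placeholders.
from pickle import load

with open("pickle/simCatalogFall2015.pickle") as f:
    model = load(f)

# Courses most similar to the first course in the catalog model.
course = model.documents[0]
for similarity, doc in model.neighbors(course, top=5):
    print("{:.3f}  {}".format(similarity, doc.name))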
Example #23
def get_model_from_documents(path='./*/*.txt'):
    '''return model from given txt files'''
    import codecs
    import glob
    from pattern.vector import Document, Model, TFIDF

    documents = []
    files = glob.glob(path)
    for file in files:
        f = codecs.open(file, 'r')
        data = f.read()
        document = Document(data)
        documents.append(document)

    model = Model(documents=documents, weight=TFIDF)
    return documents, model
Example #24
def GetVectors():
    essay_path = 'training'
    files = fio.recGetTextFiles(path.abspath(essay_path))
    docs = []
    percepticon = PerceptronTagger()
    cat_dict = defaultdict(int)
    for f in files:
        extended_text = ExtendText(f, percepticon)
        name = ''
        cats = ['high', 'medium', 'low']
        for cat in cats:
            if cat in f:
                name = cat + str(cat_dict[cat])
                cat_dict[cat] += 1
        docs.append(Document(extended_text, name=name, top=None))
    m = Model(docs)
    #lsa = m.reduce(5)
    return m
Example #25
def r2iterator_to_model(collection, query):
    r2_list = []
    for r2 in collection.get_by_example(query):
        try:
            strings = [r2['program_desc']]
            for projid in r2['projects'].keys():
                try:
                    strings.append(r2['projects'][projid]['mission_desc'])
                except KeyError:
                    pass
            try:
                doc = Document(" ".join(strings), name=r2['_id'])
                r2_list.append(doc)
            except TypeError as e:
                print repr(e)
                print r2['_id']
        except KeyError as e:
            print repr(e)
    return Model(r2_list)
Example #26
    def rankingVectorSpaceModel(self, listaUrls, consulta):
        """metodo para el ranking mediante VSM
        Entrada: Consulta de busqueda en string, y lista de urls
        Salida: lista final rankeado"""
        listaUrlsRankeados = []
        listaModel = []
        for url in listaUrls:
            documento = self.mongodb.getDocumento(url)
            if documento:
                documentoPattern = self.preprocesamiento.getDocumentoPattern(
                    documento['_id'])
                listaModel.append(documentoPattern)

        unModelo = Model(listaModel, weight=TFIDF)
        for unDocumento in unModelo:
            score = self.svm.calcularVectorSpaceModel(consulta, unDocumento)
            listaUrlsRankeados.append(
                self.crearJsonRanking(unDocumento.name, score))

        listaFinal = sorted(listaUrlsRankeados,
                            key=lambda k: k['score'],
                            reverse=False)

        return listaFinal
Example #27
File: 04-KNN.py Project: ADA110/Cibus
from pattern.web import Twitter
from pattern.en import Sentence, parse
from pattern.search import search
from pattern.vector import Document, Model, KNN

# Classification is a supervised machine learning method,
# where labeled documents are used as training material
# to learn how to label unlabeled documents.

# This example trains a simple classifier with Twitter messages.
# The idea is that, if you have a number of texts with a "type"
# (mail/spam, positive/negative, language, author's age, ...),
# you can predict the type of other "unknown" texts.
# The k-Nearest Neighbor algorithm classifies texts according
# to the k documents that are most similar (cosine similarity) to the given input document.

m = Model()
t = Twitter()

# First, we mine a model of a 1000 tweets.
# We'll use hashtags as type.
for page in range(1, 10):
    for tweet in t.search('#win OR #fail', start=page, count=100, cached=True):
        # If the tweet contains #win hashtag, we'll set its type to 'WIN':
        s = tweet.text.lower()  # tweet in lowercase
        p = '#win' in s and 'WIN' or 'FAIL'  # document labels
        s = Sentence(parse(s))  # parse tree with part-of-speech tags
        s = search('JJ', s)  # adjectives in the tweet
        s = [match[0].string for match in s]  # adjectives as a list of strings
        s = " ".join(s)  # adjectives as string
        if len(s) > 0:
            m.append(Document(s, type=p, stemmer=None))
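The comments above describe k-NN classification but the snippet stops after building the labeled model; a hedged continuation based on the pattern.vector classifier API (KNN, train, classify). The test strings are illustrative.

# Hypothetical continuation: train a k-NN classifier on the labeled documents
# and predict the label of unseen text.
classifier = KNN()
for document in m:
    classifier.train(document)          # each Document carries type='WIN' or 'FAIL'

print(classifier.classes)               # the two labels, e.g. ['WIN', 'FAIL']
print(classifier.classify("awesome"))   # likely 'WIN'
print(classifier.classify("terrible"))  # likely 'FAIL'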
Example #28
def main():

    ##############################################################################################
    print('QUESTION 1, Part I: Web Crawling: Extraction of Book Titles')
    print("-" * 70)
    print('\n')
    print(
        'Retrieving Book Titles from the first two pages of Amazon search results! \n'
    )
    print('Please wait a minute... \n')

    print("~" * 70)

    #open the base URL webpage
    level_1_url = "https://www.amazon.com/s?url=search-alias%3Daps&field-keywords=Martin+Heidegger"

    all_titles = get_titles(level_1_url)

    #print with text wrapping
    format = '%s'

    pieces = [format % (ttl) for ttl in all_titles]
    output = ' | '.join(pieces)
    ttls = fill(output)
    print('The scraped book titles are:')
    print("_" * 40)
    print('\n')
    print('\n\n'.join(ttls.split('|')))
    print('\n')

    ##############################################################################################
    print(
        'QUESTION 1, Part II: Pairwise Text Cosine Similarity Scores of Book Titles'
    )
    print("-" * 70)
    print('\n')

    doc_list = []
    for i in range(len(all_titles)):
        doc_list.append(
            Document(all_titles[i], type=" ".join(all_titles[i].split())))

    m = Model(documents=doc_list, weight=TFIDF)

    cos_similarities = [(m.similarity(x, y), m.documents[i].type,
                         m.documents[j].type)
                        for i, x in enumerate(m.documents)
                        for j, y in enumerate(m.documents) if i != j]

    unique_cos_sim = [
        tuple(x) for x in set(map(frozenset, cos_similarities))
        if len(tuple(x)) == 3
    ]

    resorted_cos_sim_ttl = []
    for i in range(len(unique_cos_sim)):
        resorted_cos_sim_ttl.append(
            sorted(tuple(str(e) for e in unique_cos_sim[i])))
        resorted_cos_sim_ttl[i][0] = float(resorted_cos_sim_ttl[i][0])
        resorted_cos_sim_ttl[i] = tuple(resorted_cos_sim_ttl[i])

    print(
        'The number of calculated book title cosine similarity scores is: {} \n'
        .format(len(resorted_cos_sim_ttl)))

    print(
        'All non-zero book title cosine similarity scores, from smallest to largest: \n'
    )
    for tup in sorted(resorted_cos_sim_ttl):
        if tup[0] != 0:
            print(tup[0])
    print('\n')

    print("~" * 70)

    #print with text wrapping
    format = '%s'

    pieces = [
        format % (sim, ) for sim in sorted(
            resorted_cos_sim_ttl, key=lambda t: t[0], reverse=True)[:5]
    ]
    output = ' | '.join(pieces)
    sims = fill(output)
    print(
        'The cosine similarity scores of the five most similar book titles are: \n'
    )
    print('\n\n'.join(sims.split('|')))
    print('\n')

    print("~" * 70)

    pieces = [
        format % (sim, ) for sim in sorted(
            resorted_cos_sim_ttl, key=lambda t: t[0], reverse=False)[:5]
    ]
    output = ' | '.join(pieces)
    sims = fill(output)
    print(
        'The cosine similarity scores of the five most dissimilar book titles are: \n'
    )
    print('\n\n'.join(sims.split('|')))
    print('\n')

    #############################################################################################
    print(
        'QUESTION 1, Part III: Most Similar and Dissimilar Book Titles and Search Rankings'
    )
    print("-" * 70)
    print('\n')

    print('The most similar pair of book titles is: \n')
    print(max(resorted_cos_sim_ttl))
    print('\n')

    print('The most dissimilar pair of book titles is: \n')
    print(min(resorted_cos_sim_ttl))
    print('\n')

    print("~" * 70)

    doc_types = [doc.type for doc in m.documents]

    print(
        'The search ranking of the first element of the most similar book title pair is: \n'
    )
    print(doc_types.index(max(resorted_cos_sim_ttl)[1]))
    print('\n')

    print(
        'The search ranking of the second element of the most similar book title pair is: \n'
    )
    print(doc_types.index(max(resorted_cos_sim_ttl)[2]))
    print('\n')

    print(
        'The search ranking of the first element of the most dissimilar book title pair is: \n'
    )
    print(doc_types.index(min(resorted_cos_sim_ttl)[1]))
    print('\n')

    print(
        'The search ranking of the second element of the most dissimilar book title pair is: \n'
    )
    print(doc_types.index(min(resorted_cos_sim_ttl)[2]))
    print('\n')

    #############################################################################################
    print('QUESTION 2, Part I: Web Crawling: Extraction of Search Capsules')
    print("-" * 70)
    print('\n')

    orig_query = 'Ponderings XII–XV: Black Notebooks 1939–1941 (Studies in Continental Thought)'

    level_1_url = "https://www.google.com/search?q=" + orig_query.replace(
        ' ', '+')

    all_capsules = get_capsules(level_1_url)

    all_capsules_clean = []
    for cp in all_capsules:
        all_capsules_clean.append(
            unicodedata.normalize('NFKD', cp).encode('ascii',
                                                     'ignore').decode('utf-8'))

    #print with text wrapping
    format = '%s'

    pieces = [format % (cap) for cap in all_capsules_clean]
    output = ' | '.join(pieces)
    caps = fill(output)
    print('The scraped capsules are:')
    print("_" * 40)
    print('\n')
    print('\n\n'.join(caps.split('|')))
    print('\n')

    ##############################################################################################
    print(
        'QUESTION 2, Part II: Pairwise Text Cosine Similarity Scores of Search Capsules'
    )
    print("-" * 70)
    print('\n')

    query_list = []
    for i in range(len(all_capsules_clean)):
        query_list.append(
            Document(all_capsules_clean[i],
                     type=" ".join(all_capsules_clean[i].split())))

    m = Model(documents=query_list, weight=TFIDF)

    cos_similarities = [(m.similarity(x, y), m.documents[i].type,
                         m.documents[j].type)
                        for i, x in enumerate(m.documents)
                        for j, y in enumerate(m.documents) if i != j]

    unique_cos_sim = [
        tuple(x) for x in set(map(frozenset, cos_similarities))
        if len(tuple(x)) == 3
    ]

    resorted_cos_sim_caps = []
    for i in range(len(unique_cos_sim)):
        resorted_cos_sim_caps.append(
            sorted(tuple(str(e) for e in unique_cos_sim[i])))
        resorted_cos_sim_caps[i][0] = float(resorted_cos_sim_caps[i][0])
        resorted_cos_sim_caps[i] = tuple(resorted_cos_sim_caps[i])

    print(
        'The number of calculated capsule cosine similarity scores is: {} \n'.
        format(len(resorted_cos_sim_caps)))

    print(
        'All non-zero capsule cosine similarity scores, from smallest to largest: \n'
    )
    for tup in sorted(resorted_cos_sim_caps):
        if tup[0] != 0:
            print(tup[0])
    print('\n')

    print("~" * 70)

    #print with text wrapping
    format = '%s'

    pieces = [
        format % (sim, ) for sim in sorted(
            resorted_cos_sim_caps, key=lambda t: t[0], reverse=True)[:5]
    ]
    output = ' | '.join(pieces)
    sims = fill(output)
    print(
        'The Cosine Similarity scores of the five most similar capsule pairs are: \n'
    )
    print('\n\n'.join(sims.split('|')))
    print('\n')

    print("~" * 70)

    pieces = [
        format % (sim, ) for sim in sorted(
            resorted_cos_sim_caps, key=lambda t: t[0], reverse=False)[:5]
    ]
    output = ' | '.join(pieces)
    sims = fill(output)
    print(
        'The Cosine Similarity scores of the five most dissimilar capsule pairs are: \n'
    )
    print('\n\n'.join(sims.split('|')))
    print('\n')

    print("~" * 70)

    print(
        'Finding the capsule with the highest cosine similarity to the original query... \n'
    )
    all_capsules_clean.append(orig_query)

    caps_and_query = []
    for i in range(len(all_capsules_clean)):
        caps_and_query.append(
            Document(all_capsules_clean[i],
                     type=" ".join(all_capsules_clean[i].split())))

    m = Model(documents=caps_and_query, weight=TFIDF)

    cos_similarities = [(m.similarity(x, y), m.documents[i].type,
                         m.documents[j].type)
                        for i, x in enumerate(m.documents)
                        for j, y in enumerate(m.documents) if i != j]

    unique_cos_sim_query = [
        tuple(x) for x in set(map(frozenset, cos_similarities))
        if len(tuple(x)) == 3
    ]

    resorted_cos_sim_query = []
    for i in range(len(unique_cos_sim_query)):
        resorted_cos_sim_query.append(
            sorted(tuple(str(e) for e in unique_cos_sim_query[i])))
        resorted_cos_sim_query[i][0] = float(resorted_cos_sim_query[i][0])
        resorted_cos_sim_query[i] = tuple(resorted_cos_sim_query[i])

    result_list = []
    for tup in resorted_cos_sim_query:
        if orig_query in tup:
            result_list.append(tup)

    result_tup = max(result_list, key=lambda x: x[0])
    print(
        'The cosine similarity score of the capsule most similar to the original query is: \n'
    )
    print(result_tup)
    print('\n')

    print(
        'Finding search ranking of the capsule with the highest cosine similarity to the original query... \n'
    )

    match_list = []
    for item in all_capsules_clean:
        match_list.append(item.replace('\n', ''))

    print(
        'The search ranking of the capsule most similar to the original query is: \n'
    )
    print(match_list.index(result_tup[1]))
    print('\n')

    #############################################################################################
    print(
        'QUESTION 2, Part III: Most Similar and Dissimilar Capsules and Search Rankings'
    )
    print("-" * 70)
    print('\n')

    print('The most similar pair of capsules is: \n')
    print(max(resorted_cos_sim_caps))
    print('\n')

    print('The most dissimilar pair of capsules is: \n')
    print(min(resorted_cos_sim_caps))
    print('\n')

    print("~" * 70)

    doc_types = [doc.type for doc in m.documents]

    print(
        'The search ranking of the first element of the most similar capsule pair is: \n'
    )
    print(doc_types.index(max(resorted_cos_sim_caps)[1]))
    print('\n')

    print(
        'The search ranking of the second element of the most similar capsule pair is: \n'
    )
    print(doc_types.index(max(resorted_cos_sim_caps)[2]))
    print('\n')

    print(
        'The search ranking of the first element of the most dissimilar capsule pair is: \n'
    )
    print(doc_types.index(min(resorted_cos_sim_caps)[1]))
    print('\n')

    print(
        'The search ranking of the second element of the most dissimilar capsule pair is: \n'
    )
    print(doc_types.index(min(resorted_cos_sim_caps)[2]))
    print('\n')

    ############################################################################################

    print('Summary Report: Document Similarity Semantic Analysis')
    print("-" * 70)
    ################
    report = "A crawler with changing user-agent headers was used to scrape book titles on Amazon from the first two pages of results returned when searching the philosopher, Martin Heidegger. Using TF-IDF values derived from a model incorporating the scraped results, all pairwise cosine similarity scores were calculated for the corpus documents, each of which consisted of the book title and any accompanying subtitle text. The scores were filtered for unique book title pairs and sorted by ascending cosine similarity score, so the top 5 and bottom 5 pairs could be printed in terminal. As several pairings returned a cosine similarity score of 0, the most dissimilar pair among the lowest scores could not be decisively quantified. Interestingly, search rankings of the elements of the most similar and dissimilar pairs did not appear on the same page of results. Another crawler was used to scrape capsules returned by a Google search for one of the book titles appearing in the Amazon results. Capsules from the first three pages of Google results were Unicode normalized and decoded before they were incorporated into another model, from which TF-IDF values were derived. All pairwise cosine similarity scores were calculated for the new set of corpus documents, which consisted of all text appearing in each capsule. Scores were filtered for unique capsule pairs and sorted by ascending cosine similarity score; the top 5 and bottom 5 pairs were again printed in terminal. To identify the capsule most similar to the original query, the latter was then included in the model, from which a new set of TF-IDF values and cosine similarity scores were generated. Interestingly, the ranking of the most similar capsule appeared lower in the search results than expected, on the bottom of the second page. Intuitively, the search rankings of the capsules most similar to one another did, however, appear on the same page of Google results."
    ##############
    format = '%s'
    pieces = [format % (word) for word in report]
    output = ''.join(pieces)
    write_up = fill(output)
    print(write_up)

    return None
Example #29
# and filters out noise, so that semantically related words come out stronger. 

# We'll use the Pang & Lee corpus of movie reviews, included in the testing suite.
# Take 250 positive reviews and 250 negative reviews:
data = os.path.join("..","..","test", "corpora", "polarity-en-pang&lee.csv")
data = Datasheet.load(data)
data = data[:250] + data[-250:]

# Build a model of movie reviews.
# Each document consists of the top 40 words in the movie review.
documents = []
for score, review in data:
    document = Document(review, stopwords=False, top=40, type=int(score) > 0)
    documents.append(document)

m = Model(documents)

print "number of documents:", len(m)
print "number of features:", len(m.vector)
print "number of features (average):", sum(len(d.features) for d in m.documents) / float(len(m))
print

# 6,337 different features may be too slow for some algorithms (e.g., hierarchical clustering).
# We'll reduce the document vectors to 10 concepts.

# Let's test how our model performs as a classifier.
# A document can have a label (or type, or class).
# For example, in the movie reviews corpus,
# there are positive reviews (score > 0) and negative reviews (score < 0).
# A classifier uses a model as "training" data
# to predict the label (type/class) of unlabeled documents.
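The comments above mention reducing the vectors to 10 concepts and using the model as training data; a hedged sketch of both steps follows, using only pattern.vector calls that appear elsewhere on this page (Model.reduce, KNN.train, classify). Whether the classifier consumes the raw tf-idf vectors or the reduced concept vectors depends on the pattern version, so treat the accuracy figure as illustrative.

# Hypothetical continuation of the movie-review example above.
from pattern.vector import KNN

m.reduce(10)  # compress ~6,300 features into 10 latent concepts

# Use part of the labeled reviews for training and the rest as a test set.
train, test = m.documents[:400], m.documents[400:]
knn = KNN()
for document in train:
    knn.train(document)  # document.type is True (positive) or False (negative)

correct = sum(1 for document in test if knn.classify(document) == document.type)
print("accuracy: %.2f" % (float(correct) / len(test)))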
Example #30

con = pymongo.MongoClient()
sentiment_res = con.tweets.sentiment_analysis
sentiment_res_p = con.tweets.patterns_sentiment_analysis
tweets = con.tweets.tweets_toronto

docs = []
# with open('D:\\data\\documents.spkl', 'wb') as fp:
#     for tweet in tweets.find():
#         doc = Document(tweet['text'],name=tweet['id'])
#         pickle.dump(doc, fp)
#     fp.close()
#

m = Model(documents=[],weight=TFIDF)

with open('D:\\data\\documents.spkl', 'rb') as fp:
    for j in range(tweets.count()/100):
        print 'Loading model'
        m.append(pickle.load(fp))
        print len(m.documents)
with open('D:\\data\\documents.spkl', 'rb') as fp:
    for j in xrange(tweets.count()):
        print 'Loading model'
        m.append(pickle.load(fp))
        print len(m.documents)
    print len(m.documents)
m.reduce(dimensions=L2)
m.save('toronto_tweets.model')  # placeholder path; Model.save() must be called with a filename
Example #31
def load_model(filename):
    '''
    Given a path/filename, load the Pattern.Vector model from that filename.
    '''
    print "Loading model from file {}".format(filename)
    return Model.load(filename)
Example #32
news, url = {}, 'http://news.google.com/news?output=rss'

for story in Newsfeed().search(url, cached=False):

    d = str(date(story.date, format='%Y-%m-%d'))
    s = plaintext(story.description)

    # Each key in the news dictionary is a date: news is grouped per day.
    # Each value is a dictionary of id => story items.
    # We use hash(story.description) as a unique id to avoid duplicate content.

    news.setdefault(d, {})[hash(s)] = s

# Your code will probably have some preprocessing steps to save and load the mined news updates.

m = Model()

for date, stories in news.items():
    s = stories.values()
    s = ' '.join(s).lower()

    # Each day of news is a single document.
    # By adding all documents to a model we can calculate tf-idf.

    m.append(Document(s, stemmer=LEMMA, exclude=['news', 'day'], name=date))

for document in m:

    print document.name
    print document.keywords(top=10)
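The comment about saving and loading the mined news updates invites a persistence step; a minimal sketch with pickle (the filename is a placeholder), so repeated runs can accumulate stories before building the model:

# Hypothetical persistence step for the news dict built above;
# 'news_by_day.pkl' is a placeholder filename.
import os
import pickle

NEWS_FILE = 'news_by_day.pkl'

def load_news(path=NEWS_FILE):
    if not os.path.exists(path):
        return {}
    with open(path, 'rb') as f:
        return pickle.load(f)

def save_news(news, path=NEWS_FILE):
    with open(path, 'wb') as f:
        pickle.dump(news, f)

# Merge today's stories into what was mined on earlier runs;
# the hash(description) ids keep duplicates out.
stored = load_news()
for d, stories in news.items():
    stored.setdefault(d, {}).update(stories)
save_news(stored)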
Example #33
def get_results(query, quantity, force=False, news=False, analysis=True):
    query = query.lower()
    start = datetime.now()

    query = query.replace('_', '%20')
    breakdown = 50

    if breakdown > quantity:
        breakdown = quantity

    data_to_be_written = []
    knowledgeKeywords = []
    duplicates = []

    results, created = webSearch.objects.get_or_create(queryText=query.strip())
    if created or force or len(results.results.all()) < quantity:
        all_results = getGoogleResults(query, quantity, news, force)
    else:
        all_results = []

    if len(all_results) == 0 and not created:
        all_results = [r.url for r in results.results.all()]

    all_results = all_results[:quantity]
    print "TOTAL RESULTS ", str(len(all_results))
    # Done with getting search results

    for index, i in enumerate(all_results):
        try:
            wr, created = WebResource.objects.get_or_create(url=i)
            if created:
                wr = parseURL(i, True)
            data = {'url': i}
            keywords = [
                w for w in count(wr.text, top=10, stemmer=LEMMA)
                if w not in stop
            ]

            if 'books.google' in i:
                text = ''
            else:
                text = wr.text

            data.update({
                'keywords': keywords,
                'text': plaintext(text),
                'title': wr.title,
                'urls': wr.urls,
                'type': 'result',
                'index': index + 1,
                'similar': [],
                'duplicates': [],
                'category': 0,
            })

            if wr not in results.results.all():
                results.results.add(wr)

            data['plaintext'] = data['text'].split('\n')

            # while '' in data['plaintext']:
            # 	data['plaintext'].remove('')

            # knowledgeKeywords.extend(data['keywords'])

            data_to_be_written.append(data)
        except Exception as e:
            print e

    print "Response Result model Prepared"

    if not analysis:
        return data_to_be_written

    list_of_sim_docs, model, m = find_similarity(data_to_be_written)
    for i in list_of_sim_docs:
        similar = {
            'type': 'similar',
            's': i.get('source'),
            'd': i.get('dest'),
            'source': i.get('source'),
            'dest': i.get('dest'),
            'score': i.get('score'),
        }
        data_to_be_written.append(similar)

        if similar['score'] > 0.9:
            for res in data_to_be_written:
                if res['type'] in [
                        'result', 'duplicate'
                ] and res['url'] == i.get('dest') and len(res['text']) > 0:
                    print "Duplicate [{0}].[{1}]".format(
                        i['source'][:20], i['dest'][:20])
                    res['type'] = 'duplicate'

    items = [
        Document(i.get('text'),
                 name=i.get('url'),
                 description=i.get('index'),
                 stemmer=LEMMA) for i in data_to_be_written
    ]
    m = Model(items, weight=TFIDF)

    # k = 10
    ####### BEGIN Experimental Setup ##########

    # v,d = m.features, m.documents
    # y,x = len(m.documents),len(m.features)

    def build_matrix(w=None, d=None):
        y, x = len(d), len(w)
        model = np.zeros((y, x))

        for i in range(y):
            model[i] = [1 if w[j] in d[i].words else 0 for j in range(x)]

        return model

    # def find_word_matches(model, words = None, d = None):
    # 	y,x = model.shape
    # 	for i in range(y):
    # 		for j in range(i+1,y):
    # 			a = np.copy(model[i])
    # 			b = np.copy(model[j])

    # 			a_ones = np.count_nonzero(a)
    # 			b_ones = np.count_nonzero(b)

    # 			comparison = (a==b)

    # 			cross_product = a*b
    # 			intersection = np.count_nonzero(cross_product)
    # 			union = a_ones+b_ones-intersection

    # 			if a_ones+b_ones>0 and intersection > 0:
    # 				score = intersection/union
    # 			else:
    # 				score = 0

    # 			if model[i].any() and model[j].any() and comparison.any() and score > 0.4:
    # 				print "Match [{0}] {1}:[{2} words] - [{3}] {4}:[{5} words] : {6} words".format(d[i].description,d[i].name[:30], np.count_nonzero(a), d[j].description,d[j].name[:30], np.count_nonzero(b), score, math.fabs(d[i].description - d[j].description))
    # 				similar = {
    # 					'type' : 'similar',
    # 					'source' : d[i].name,
    # 					'dest' : d[j].name,
    # 					'score' : score,
    # 				}
    # 				data_to_be_written.append(similar)

    # 			if score >= 0.9:
    # 				for res in data_to_be_written:
    # 					if res['type'] in ['result','duplicate'] and res['url'] == d[j].name and len(res['text'])>0:
    # 						print "Duplicate [{0}].[{1}]".format(i+1,j+1)
    # 						res['type'] = 'duplicate'
    # 	return model

    def word_frequency(model,
                       words=None,
                       documents=None,
                       threshold1=0,
                       threshold2=1,
                       transpose=False):
        "Returns frequent word amoung documents in range of threshold"
        y, x = model.shape
        data = {}

        for i in range(x):
            count = np.count_nonzero(model[:, i]) / float(y)  # document frequency as a fraction
            if count >= threshold1 and count <= threshold2:
                if words:
                    data[words[i]] = count
                else:
                    data[i] = count
        return data

    model = build_matrix(m.features, m.documents)
    # model = find_word_matches(model, m.features, m.documents)
    knowledgeKeywords = [
        w for w in word_frequency(model, m.features, m.documents, 0.2, 0.8)
    ][:20]

    ####### END Experimental Setup ##########

    # c = m.cluster(method=HIERARCHICAL, k=k)
    # for i in c:
    # 	cluster = []
    # 	k = []
    # 	contains_text = False

    # 	for item in i:
    # 		for data in data_to_be_written:
    # 			if data.get('type') == 'result' and data.get('url')==item.name:
    # 				cluster.append({
    # 					'url' : data.get('url'),
    # 					'index' : item.description,
    # 					})
    # 				if data.get('text'):
    # 					k.extend([w for w in count(words(data.get('text')), top=50, stemmer = PORTER, exclude=[], stopwords=False, language='en')])
    # 					contains_text=True
    # 	cluster = {
    # 		'type' : 'cluster',
    # 		'data' : cluster,
    # 		'index' : min([c.get('index') for c in cluster] + [0]),
    # 		'keywords' : [w for w in count(k, top=10, stemmer = PORTER, exclude=[], stopwords=False, language='en')]
    # 	}

    # 	cluster['contains_text'] = contains_text

    # 	data_to_be_written.append(cluster)

    # print "{0} results".format(len(data_to_be_written))
    data_to_be_written.append({
        'type': 'meta',
        'keywords': knowledgeKeywords,
    })

    result = {}
    for i in data_to_be_written:
        if i.get('type') in ['result', 'duplicate']:
            url = i.get('url')
            index = int(i.get('index'))

            result[index] = [
                1 for r in data_to_be_written
                if r.get('type') == 'similar' and r['source'] == url
            ]

    result2 = [i for i, j in result.iteritems()]
    result3 = [len(j) for i, j in result.iteritems()]

    Process(target=plot_graph, args=(result2, result3)).start()

    return data_to_be_written
Example #34
 def crearModelo(self, listaDocumentos):
     '''Create a model from the document list using TFIDF frequency weighting'''
     return Model(listaDocumentos, weight=TFIDF)
Example #35
import cPickle as pickle

con = pymongo.MongoClient()
sentiment_res = con.tweets.sentiment_analysis
sentiment_res_p = con.tweets.patterns_sentiment_analysis
tweets = con.tweets.tweets_toronto

docs = []
# with open('D:\\data\\documents.spkl', 'wb') as fp:
#     for tweet in tweets.find():
#         doc = Document(tweet['text'],name=tweet['id'])
#         pickle.dump(doc, fp)
#     fp.close()
#

m = Model(documents=[], weight=TFIDF)

with open('D:\\data\\documents.spkl', 'rb') as fp:
    print 'Loading model'
    # One pickled Document per tweet was written to the file, so read them all back.
    for j in xrange(tweets.count()):
        m.append(pickle.load(fp))
    print len(m.documents)
m.reduce(dimensions=L2)
m.save('D:\\data\\tweets_model.mdl')  # Model.save() needs a destination path; this filename is assumed
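# A brief follow-up sketch (the filename matches the one assumed in m.save() above):
# the persisted model can be restored later with Model.load(), which brings back the
# documents and their TF-IDF weights without re-reading MongoDB.
m2 = Model.load('D:\\data\\tweets_model.mdl')
print len(m2.documents)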
示例#36
0
r2_list = []
for query in r2_queries:
    for r2 in r2_exhibits.get_by_example(query):
        try:
            strings = [r2['program_desc']]
            projects = [r2['projects'][k] for k in r2['projects'].keys()]
            for proj in projects:
                try:
                    strings.append(proj['mission_desc'])
                except KeyError as e:
                    pass
            doc = Document(" ".join(strings), name=r2['_id'])
            r2_list.append(doc)
        except KeyError as e:
            print repr(e) # not much to do about this
m = Model(r2_list)

def r2iterator_to_model(collection, query):
    r2_list = []
    for r2 in collection.get_by_example(query):
        try:
            strings = [r2['program_desc']]
            for projid in r2['projects'].keys():
                try:
                    strings.append(r2['projects'][projid]['mission_desc'])
                except KeyError:
                    pass
            try:
                doc = Document(" ".join(strings), name=r2['_id'])
                r2_list.append(doc)
            except TypeError as e:
                print repr(e)
        except KeyError as e:
            print repr(e) # not much to do about this
    return Model(r2_list)
class ClusterLSI(object):
	def __init__(self):
		"""Setting up ClusterLSI environment
		"""
		
		
		self.field = "scopeAndContent"
		self.limit = False
		self.identifier = "idDoc"
		
		self.model = False
		self.cluster = False
		
		self.depth = 0
		
		self.outputNodes = "./clus-nodes.csv"
		self.outputEdges = "./clus-edges.csv"
	
	
	def normalize(self, s):
		"""Normalize a string
		
		Keyword arguments:
		s	---	string
		
		"""
		if type(s) == unicode: 
			return s.encode('utf8', 'ignore')
		else:
			return str(s)

	def modeling(self, descriptions, field = False, limit = False):
		"""Model returns a pattern.vector.Model object which is a list of pattern.vector.Document using Ehri.Get() descriptions
		Keyword arguments:
		descriptions ---	EHRI.get() description object
		field 	---		Field to look into, overrides the default self.field
		limit	---		Debug option. Limit the model to $limit items
		"""
		if field:
			self.field = field
		if limit:
			self.limit = limit
		D = []
		
		#Creating Pattern Document element from data we got from Neo4J
		#
		
		#For debug reasons, we could set a limit
		if self.limit:
			i = 0
		for description in descriptions:
			D.append(Document(description[self.field], name=description[self.identifier]))
			#And stop the iteration when i reaches the limit
			if self.limit:
				i += 1
				if i == self.limit:
					break
		#Then, creating a model from our array
		self.model = Model(D)
		
		return self.model 
		
	def clusterize(self, model = False):
		"""Returns a cluster of given model
		
		Keyword arguments:
		model	---	If set, override instance model
		
		"""
		if model:
			self.model = model
		self.cluster = self.model.cluster(method=HIERARCHICAL, k=2)
		return self.cluster

	def flatten(self, array, typeOf = "str"):
		"""Returns a 1 dimension list with given type of item inside given array
		
		Keyword arguments:
		array	---	A list of items
		typeOf	---	Type of item the function should return
		
		"""
		#Flatten an array
		if typeOf == "str":
			return [element for element in array if isinstance(element, basestring)]
		elif typeOf == "list":
			return [element for element in array if isinstance(element, list)]

	def csv(self, array, parents = False, fake = 0):
		"""Return a tuple of csv string with given items and number of fake items
		
		Keyword arguments:
		array	---	A list of items
		parents	---	A list of parents
		fake	---	An index for fake parents 
		
		"""
		string = "" 
		#Making list of elements, avoid calling it once more
		currents = self.flatten(array, "str")
		children = self.flatten(array, "list")
		
		if len(currents) == 0:
			fake += 1
		Ffake = fake
		#If we have parents, we have parents connections
		if parents:
			for element in currents:
				for parent in parents:
					string += self.normalize(element) + ";" + parent + "\n"
		
		#Taking care of children
		for child in children:
			if len(currents) > 0:
				Sstring, Ffake = self.csv(child, currents, Ffake)
			else:
				Sstring, Ffake = self.csv(child, ["fake-"+str(fake)], Ffake)
			string += Sstring
				
			
		return string, Ffake
		

	def clusterToArray(self, Graph):
		"""Convert a cluster object to an array list with n-depth where depth is same as cluster.depth
		
		Keyword arguments:
		
		Graph	---	Cluster or list
		"""
		array = []
		
		Docs = [element for element in Graph if isinstance(element, pattern.vector.Document)]
		Clusts = [element for element in Graph if isinstance(element, list)]
		
		for node in Docs:
			array.append(node.name)
		for node in Clusts:
			array.append(self.clusterToArray(node))
		return array
	
	def save(self, descriptions, csv, fakes = 0, nodesName = False, edgesName = False ):
		"""Output cluster into csv files
		
		Keyword arguments:
		descriptions	---	EHRI.get() description item
		csv	---	Edge list string produced by self.csv()
		fakes	---	Number of fake parents
		nodesName	---	Filename for the nodes CSV file
		edgesName	---	Filename for the edges CSV file
		
		"""
		if nodesName:
			self.outputNodes = nodesName
		if edgesName:
			self.outputEdges = edgesName
		
		
		f = open(self.outputNodes, "wt")
		f.write("id;label;type\n")
		for description in descriptions:
			f.write(self.normalize(description[self.identifier] + ";" + description[self.identifier] + ";1\n"))
		i=0
		while i <= fakes:
			f.write("fake-" + str(i) + ";" + "fake" + str(i) + ";0\n")
			i+= 1
		f.close()

		f = open(self.outputEdges, "wt")
		f.write("source;target\n");
		f.write(csv)
		f.close()
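# A minimal usage sketch (not part of the original class): the `descriptions` list below is a
# hypothetical stand-in for EHRI.get() output, using the default "scopeAndContent" and "idDoc"
# fields the class expects. It assumes the imports from the original file (pattern.vector,
# Document, Model, HIERARCHICAL) are present. It models, clusters, flattens the cluster tree
# and writes the two CSV files.
if __name__ == "__main__":
	descriptions = [
		{"idDoc": "doc-1", "scopeAndContent": "Correspondence between the ministry and regional offices."},
		{"idDoc": "doc-2", "scopeAndContent": "Administrative files and meeting minutes of the committee."},
		{"idDoc": "doc-3", "scopeAndContent": "Personal letters, diaries and photographs of one family."},
		{"idDoc": "doc-4", "scopeAndContent": "Photographs and correspondence collected after the war."},
	]
	clusterer = ClusterLSI()
	clusterer.modeling(descriptions)
	clusterer.clusterize()
	edges, fakes = clusterer.csv(clusterer.clusterToArray(clusterer.cluster))
	clusterer.save(descriptions, edges, fakes)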






refineddata1 = [(features(c[0]),c[1]) for c in refineddata]


#Each datapoint becomes a pattern document here; The type represents the label for each document#
refineddata2 = [Document(message, type=sideeffectindicator) for message, sideeffectindicator in refineddata1]


#Defining the model using the documents; feature weight is Information gain; You can try changing to TF, TFIDF etc#
model = Model(documents=refineddata2, weight=IG)


#Top 500 features selected#
features = model.feature_selection(top=500)

#If medicine names are present they are removed #
refinedfeatures = []
for i in features:
    if i not in medlist:
        refinedfeatures.append(i)
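# A brief follow-up sketch: keep only the selected non-medicine features in the model,
# using Model.filter(features=[...]) (the same call used in the feature-selection example
# further below); filter() returns a new, smaller Model.
model = model.filter(features=refinedfeatures)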
        



示例#39
0
# -*- coding: utf-8 -*-

from json import load
from pattern.vector import Document, Model, L2

packages = load(file("packages.json"))

docs = [Document(p['description'], name=p['name']) for p in packages]
model = Model(docs)

lsa = model.reduce(L2)
示例#40
0
from pattern.vector import Document, Model, Vector, distance, TFIDF

d1 = Document('The cat purrs.', name='cat1')
d2 = Document('Curiosity killed the cat.', name='cat2')
d3 = Document('The dog wags his tail.', name='dog1')
d4 = Document('The dog is happy.', name='dog2')

m = Model([d1, d2, d3, d4])
m.reduce(2)
 
for d in m.documents:
    print
    print d.name
    for concept, w1 in m.lsa.vectors[d.id].items():
        for feature, w2 in m.lsa.concepts[concept].items():
            if w1!=0 and w2!=0:
                print (feature, w1 * w2)
# the weights will be between 0.0-1.0 (their sum is 1.0).
print document.copy()
# document vector
v1 = Vector({"curiosity": 1, "kill": 1, "cat": 1})
v2 = Vector({"curiosity": 1, "explore": 1, "mars": 1})
print 1 - distance(v1, v2)
# model
d1 = Document('A tiger is a big yellow cat with stripes.', type='tiger')
d2 = Document(
    'A lion is a big yellow cat with manes.',
    type='lion',
)
d3 = Document('An elephant is a big grey animal with a slurf.',
              type='elephant')
print d1.vector
m = Model(documents=[d1, d2, d3], weight=TFIDF)
print d1.vector
print m.similarity(d1, d2)  # tiger vs. lion
print m.similarity(d1, d3)  # tiger vs. elephant
# lsa concept space
d1 = Document('The cat purrs.', name='cat1')
d2 = Document('Curiosity killed the cat.', name='cat2')
d3 = Document('The dog wags his tail.', name='dog1')
d4 = Document('The dog is happy.', name='dog2')
m = Model([d1, d2, d3, d4])
m.reduce(2)
for d in m.documents:
    print
    print d.name
    for concept, w1 in m.lsa.vectors[d.id].items():
        for feature, w2 in m.lsa.concepts[concept].items():
            if w1 != 0 and w2 != 0:
                print (feature, w1 * w2)
示例#42
0
def loadTFIDF():
  """Load the pickle file created by run TFIDF"""

  return Model.load("project/pickle/course.pic")
def initializeModel():
    classifierModel = Model.load('classificationModel.slp')
    return classifierModel
示例#44
0
from pattern.web import Twitter
from pattern.en import Sentence, parse
from pattern.search import search
from pattern.vector import Document, Model, KNN

# Classification is a supervised machine learning method,
# where labeled documents are used as training material
# to learn how to label unlabeled documents.

# This example trains a simple classifier with Twitter messages.
# The idea is that, if you have a number of texts with a "type"
# (mail/spam, positive/negative, language, author's age, ...),
# you can predict the type of other "unknown" texts.
# The k-Nearest Neighbor algorithm classifies texts according
# to the k documents that are most similar (cosine similarity) to the
# given input document.

m = Model()
t = Twitter()

# First, we mine a model of a 1000 tweets.
# We'll use hashtags as type.
for page in range(1, 10):
    for tweet in t.search('#win OR #fail', start=page, count=100, cached=True):
        # If the tweet contains #win hashtag, we'll set its type to 'WIN':
        s = tweet.text.lower()               # tweet in lowercase
        p = '#win' in s and 'WIN' or 'FAIL'  # document labels
        # parse tree with part-of-speech tags
        s = Sentence(parse(s))
        s = search('JJ', s)                  # adjectives in the tweet
        s = [match[0].string for match in s]  # adjectives as a list of strings
        s = " ".join(s)                      # adjectives as string
        if len(s) > 0:
            m.append(Document(s, type=p))
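# A minimal continuation sketch (not in the original snippet): train a k-NN classifier on the
# labelled documents and classify a new, unseen message by its adjectives. The sample text is
# made up; classify() returns the predicted label ('WIN' or 'FAIL') or None.
classifier = KNN()
for document in m:
    classifier.train(document)
print classifier.classify(Document('horrible slow disappointing'))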
示例#45
0
# to represent this.

# A Model is a collection of documents vectors.
# A Model is a matrix (or vector space)
# with features as columns and feature weights as rows.
# We can then do calculations on the matrix,
# for example to compute TF-IDF or similarity between documents.

# Load a model from a folder of text documents:
documents = []
for f in glob.glob(os.path.join(os.path.dirname(__file__), "corpus", "*.txt")):
    text = codecs.open(f, encoding="utf-8").read()
    name = os.path.basename(f)[:-4]
    documents.append(Document(text, name=name))

m = Model(documents, weight=TFIDF)

# We can retrieve documents by name:
d = m.document(name="lion")

print(d.keywords(top=10))
print()
print(d.tf("food"))
# TF-IDF is less: "food" is also mentioned with the other animals.
print(d.tfidf("food"))
print()

# We can compare how similar two documents are.
# This is done by calculating the distance between the document vectors
# (i.e., finding those that are near to each other).
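# A brief continuation sketch: "tiger" is assumed to be another document in the corpus folder.
# Cosine similarity in the TF-IDF space ranges from 0.0 (no shared features) to 1.0 (identical).
print(m.similarity(m.document(name="lion"), m.document(name="tiger")))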
        # per comment to the document
        all_entry_comment_text_filtered += len(entry_comments) * \
                                           " xxludumscrapecommentcounterxx "

    #print(all_entry_comment_text_filtered)
    # A 'document' is a bag of words from all comments for one game
    # entry (seems to work better grouping all comments), associated with
    # its rating or classification (e.g. type=output_vector).
    documents.append(Document(all_entry_comment_text_filtered,
                              name="%s\t%s" % (author, url),
                              type=output_vector,
                              stopwords=True))

vectors = []
if use_feature_selection:
    vectors = Model(documents=documents, weight=pattern.vector.TFIDF)
    vectors = vectors.filter(
        features=vectors.feature_selection(top=select_top_n_features))
    #print(vectors.vectors)
else:
    vectors = documents

if options["train"]:
    if classifier_type == "SVM":
        classifier = SVM(train=vectors,
                         type=svm_type,
                         kernel=svm_kernel)
    else:
        classifier = getattr(pattern.vector, classifier_type)(train=vectors)

    print("Classes: " + repr(classifier.classes))
示例#47
0
# to represent this.

# A Model is a collection of documents vectors.
# A Model is a matrix (or vector space)
# with features as columns and feature weights as rows.
# We can then do calculations on the matrix,
# for example to compute TF-IDF or similarity between documents.

# Load a model from a folder of text documents:
documents = []
for f in glob.glob(os.path.join(os.path.dirname(__file__), "corpus", "*.txt")):
    text = codecs.open(f, encoding="utf-8").read()
    name = os.path.basename(f)[:-4]
    documents.append(Document(text, name=name))

m = Model(documents, weight=TFIDF)

# We can retrieve documents by name:
d = m.document(name="lion")

print d.keywords(top=10)
print
print d.tf("food")
print d.tfidf("food")  # TF-IDF is less: "food" is also mentioned with the other animals.
print

# We can compare how similar two documents are.
# This is done by calculating the distance between the document vectors
# (i.e., finding those that are near to each other).
def load_model(filename):
    '''
    Given a path/filename, load the Pattern.Vector model from that filename.
    '''
    print "Loading model from file {}".format(filename)
    return Model.load(filename)
示例#49
0
File: 03-lsa.py  Project: Abhishek-1/temp
# We'll use the Pang & Lee corpus of movie reviews, included in the testing suite.
# Take 250 positive reviews and 250 negative reviews:
data = os.path.join(os.path.dirname(__file__), "..", "..", "test", "corpora",
                    "polarity-en-pang&lee1.csv")
data = Datasheet.load(data)
data = data[:250] + data[-250:]

# Build a model of movie reviews.
# Each document consists of the top 40 words in the movie review.
documents = []
for score, review in data:
    document = Document(review, stopwords=False, top=40, type=int(score) > 0)
    documents.append(document)

m = Model(documents)

print("number of documents:", len(m))
print("number of features:", len(m.vector))
print("number of features (average):",
      sum(len(d.features) for d in m.documents) / float(len(m)))
print()

# 6,337 different features may be too slow for some algorithms (e.g., hierarchical clustering).
# We'll reduce the document vectors to 10 concepts.

# Let's test how our model performs as a classifier.
# A document can have a label (or type, or class).
# For example, in the movie reviews corpus,
# there are positive reviews (score > 0) and negative reviews (score < 0).
# A classifier uses a model as "training" data
示例#50
0
def recommend_game(this_game):
    games = recommendable_games(this_game)

    total_recommendable = games.count()
    print 'Total recommendable games based on ' + this_game.title + ": " + str(total_recommendable)

    document_title = Document(this_game.title)
    document_publisher = Document(this_game.publisher)
    document_summary = Document(this_game.summary,
                                top=None,
                                threshold=0,
                                stemmer=None,
                                exclude=[],
                                stopwords=False,
                                language='en')
    document_keywords = Document(', '.join([x['name'] for x in this_game.keywords.all().values("name")]))
    document_genres = Document(', '.join([x['name'] for x in this_game.genres.all().values("name")]))

    # format: {"id":id, socre:"SUM(dist*pond)"}
    game_similarities = []
    summary_documents = []
    for game in games:
        score = 0
        game = Game.objects.filter(title=game['title'], platform=game['platform'])[0]

        title_similarity = 1 - distance(document_title.vector, Document(game.title).vector)
        publisher_similarity = 1 - distance(document_publisher.vector, Document(game.publisher).vector)
        genre_similarity = 1 - distance(document_genres.vector, Document(
            ', '.join([x['name'] for x in game.genres.all().values("name")])
        ).vector)
        keywords_similarity = 1 - distance(document_keywords.vector, Document(
            ', '.join([x['name'] for x in game.keywords.all().values("name")])
        ).vector)

        score = (0.15 * title_similarity) + (0.2 * genre_similarity) + (0.2 * publisher_similarity) + (
            0.20 * keywords_similarity)

        summary_documents.append(Document(game.summary,
                                          top=None,
                                          threshold=0,
                                          stemmer=None,
                                          exclude=[],
                                          stopwords=False,
                                          language='en',
                                          name=game.id))

        game_similarities.append({"id": game.id, "score": score})

    to_compare = document_summary  # already a Document; no need to wrap it again

    model = Model(documents=summary_documents, weight=TFIDF)

    neighbours = model.neighbors(to_compare, top=total_recommendable)

    for neighbour in neighbours:
        for rec_game in game_similarities:
            if rec_game['id'] == neighbour[1].name:
                rec_game['score'] = rec_game['score'] + 0.25 * neighbour[0]

    recommended = sorted(game_similarities, key=lambda k: -k['score'])[0:total_recommendable]

    if len(recommended) >= 40:
        random_selection = random.sample(recommended[0:40], 25)
    else:
        random_selection = random.sample(recommended, 25)

    recommended_ids = [g['id'] for g in random_selection]

    return recommended_ids
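# A small sanity-check sketch (not part of the original function): the blend used above weights
# title 15%, publisher 20%, genre 20%, keywords 20% and summary similarity 25%, so a game that
# matches perfectly on every facet reaches the maximum score of 1.0.
_weights = {'title': 0.15, 'publisher': 0.20, 'genre': 0.20, 'keywords': 0.20, 'summary': 0.25}
assert abs(sum(_weights.values()) - 1.0) < 1e-9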
示例#51
0
# but it is still popular because it is fast for models
# that have many documents and many features.
# It is outperformed by KNN and SVM, but useful as a baseline for tests.

# We'll test it with a corpus of spam e-mail messages,
# included in the test suite, stored as a CSV-file.
# The corpus contains mostly technical e-mail from developer mailing lists.
data = os.path.join(os.path.dirname(__file__), "..", "..", "test", "corpora",
                    "spam-apache.csv")
data = Datasheet.load(data)

documents = []
for score, message in data:
    document = Document(message, type=int(score) > 0)
    documents.append(document)
m = Model(documents)

print("number of documents:", len(m))
print("number of words:", len(m.vector))
print("number of words (average):",
      sum(len(d.features) for d in m.documents) / float(len(m)))
print()

# Train Naive Bayes on all documents.
# Each document has a type: True for actual e-mail, False for spam.
# This results in a "binary" classifier that either answers True or False
# for unknown documents.
classifier = NB()
for document in m:
    classifier.train(document)
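# A brief continuation sketch (not in the original snippet): once trained, the classifier labels
# unseen messages; per the comments above, True means legitimate e-mail and False means spam.
# The sample text below is made up.
print(classifier.classify(Document("Win a free holiday, click here now!")))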
示例#52
0
from pattern.vector import Document, Model, IG, TF, TFIDF, BINARY
import sys
import os

print "Reading sample code and instantiating documents..."
documents = []
exampleDir = "examples/"
for file in os.listdir(exampleDir):
    if os.path.isdir(exampleDir + file):
        for subfile in os.listdir(exampleDir + file):
            if (os.path.isfile(exampleDir + file + "/" + subfile)):
                with open (exampleDir + file + "/" + subfile, "r") as langDoc:
                    text = langDoc.read()
                    doc = Document(text, type=file)
                    documents.append(doc)

print "Creating statistical model..."
m = Model(documents=documents, weight=IG)

# Test with sample Java doc
print "Comparing test document..."
with open ("coffee.txt", "r") as myfile:
    testFile = myfile.read()
testDoc = Document(testFile, type='Java')
testSimilarities = m.neighbors(testDoc, top=10)
prediction = testSimilarities[0][1].type #neighbors() returns (similarity, document) list
confidence = testSimilarities[0][0]
print "LanguageLearn has predicted " + testSimilarities[0][1].type + " with a " + str(round(confidence * 100, 2)) + "% confidence"