Exemplo n.º 1
0
import os

from corpus.textExtractor import TextExtractor
from corpus.bowBuilder import BowBuilder
from lda.ldaCalc import LdaCalc

# Cache location handed to every pipeline stage below: "<cwd>/cache".
cache_root = os.path.join(os.getcwd(), "cache")

# Stage 1: collect the texts of the corpus.
print("\nextracting text from corpus ------------------------------------\n")
extractor = TextExtractor(cache_root)
extractor.get_texts()
# NOTE: persisting the extracted texts (extractor.save()) is currently disabled.

# Stage 2: turn the extracted texts into bag-of-words vectors and save them.
print("\nbuilding BOW vectors from corpus ------------------------------------\n")
bowBuilder = BowBuilder(
    docs=extractor.texts,
    cache_dir=cache_root,
)
bowBuilder.generate_bows()
bowBuilder.save()

# Stage 3: run LDA on the BoW corpus with the builder's id2word mapping.
print("\ntraining LDA model -----------------------------------------------\n")
lda = LdaCalc(
    bowBuilder.bowVectorCorpus,
    bowBuilder.id2word,
    cache_root,
)
lda.run_lda()

# Stage 4: save the model, then print its topics.
print("\nsaving LDA model -----------------------------------------------\n")
lda.save()
lda.print_topics()


Exemplo n.º 2
0
    # topn_json = json.dumps(topn)
    # print str(topn_json)
    return json_list


if __name__ == "__main__":
    # Script entry point: restore the cached corpus and LDA model, print the
    # model's topics, then announce the server start.

    # Vocabulary and text corpus are both restored from the Reuters cache
    # directory via their load_from_cache() methods.
    print("Loading BoW Vectors from cache...\n")
    vocab = Vocabulary(vocab_file)
    vocab.load_from_cache(reuters_cache_path)
    text_corpus = Text(text_file, vocab)
    text_corpus.load_from_cache(reuters_cache_path)

    # Build the LDA wrapper around the corpus' BoW vectors and load it.
    print("Loading LDA model...\n")
    lda_model = LdaCalc(
        bows=text_corpus.bow_vectors,
        sims_cache_dir=sims_cache_path,
        lda_cache_dir=reuters_cache_path,
        num_topics=num_topics,
    )
    lda_model.load()
    lda_model.print_topics()

    # Disabled manual similarity check, kept only as a recipe.
    # The sample query used to be a complete news article pasted inline; its
    # line breaks escaped the comment and made this module a SyntaxError, so
    # the article is represented by a placeholder string here.
    #
    # print("Indexing Model for similarities...")
    # index = gensim.similarities.MatrixSimilarity(lda_model.lda_model[text_corpus.bow_vectors])
    # doc = "<raw text of the query document>"
    # vec_bow = lda_model.id2word.doc2bow(doc.lower().split())
    # vec_lda = lda_model.lda_model[vec_bow]
    # sims = index[vec_lda]
    # sims = sorted(enumerate(sims), key=lambda item: -item[1])
    # print(sims)

    print("\nStarting Server...")
Exemplo n.º 3
0
__author__ = 'thomas'

import os
from corpus.bowBuilder import BowBuilder

from lda.ldaCalc import LdaCalc

# Cache location shared by the BoW builder and the LDA wrapper: "<cwd>/cache".
cache_root = os.path.join(os.getcwd(), "cache")

# Load the BoW builder's state and take its vector corpus.
bowBuilder = BowBuilder(cache_dir=cache_root)
bowBuilder.load()
bows = bowBuilder.bowVectorCorpus

# Load the LDA model for that corpus and print its topics.
print("\nLoading LDA model -----------------------------------------------\n")
lda = LdaCalc(
    bows=bows,
    sims_cache_dir=cache_root,
)
lda.load()
lda.print_topics()

# Compute the similarities.
lda.calc_sims()
# NOTE: saving them (lda.save_sims()) is currently disabled.