예제 #1
0
collection = mydb['doccat']
DOCUMENT_FOLDER = 'C://Users//champ//Documents//watchdocgit//Testdocuments//6451to100204'
textrazor.api_key = "0db9955a7bd9f0d9ac9f96a28c0093123b4546dd3bfff2cfd6f0f505"
client = textrazor.TextRazor(extractors=["entities", "topics"])
client.set_classifiers(["textrazor_newscodes"])
for filename in os.listdir(DOCUMENT_FOLDER):
    fileProcessed = 0
    selectedFile = open(DOCUMENT_FOLDER + "/" + filename, "r")
    selectedFilePath = str(DOCUMENT_FOLDER + "/" + filename)
    print(filename)
    finalcat = []
    finalscore = []
    input_file = file(selectedFilePath).read().decode("utf-8")
    startLines = input_file[0:100]
    #print(startLines)
    response = client.analyze(input_file)
    entities = list(response.entities())
    entities.sort(key=lambda x: x.relevance_score, reverse=True)
    seen = set()
    keywords = list()
    info = list()
    for entity in entities:
        if entity.id not in seen:
            # print (entity.id, entity.relevance_score, entity.confidence_score, entity.freebase_types)
            seen.add(entity.id)
            keywords.append(entity.id)
    mydb.keywords.insert({"keywords": keywords, "name": filename})
    print("--------------------------------------------")
    topiclist = list()
    for topic in response.topics():