Example #1
def getTagsFromText():
    input_text = request.args.get('text')
    if input_text is None:
        return "Please give a 'text' parameter to analyse."
    data = {}
    result_category = similarityTopic(input_text)

    # Collect tags from the 25 pre-tagged training articles.
    tagDict = {}
    for i in range(25):
        tagDict = test_article(tagDict, i)

    # Annotate the input text with the EventRegistry analytics service.
    er = EventRegistry(apiKey='ab40eb06-3900-4689-a369-b4098f4e49ef')
    analytics = Analytics(er)
    ann = analytics.annotate(input_text)

    # Round-trip through JSON to normalise the annotation response,
    # then pull out the annotation titles.
    parsed_json = json.loads(json.dumps(ann))
    annotations = [annotation[u'title'] for annotation in parsed_json[u'annotations']]

    # Merge the TF-IDF tags with the annotations and count occurrences.
    my_list = list(itertools.chain(*tagDict.values())) + annotations

    data['text_category'] = str(result_category)
    data['encoding'] = 'utf-8'
    data['tags'] = dict(pd.value_counts(my_list))
    return json.dumps(data)
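The function above reads like a Flask view (note request.args.get). A minimal sketch of wiring it up, assuming Flask; the route path and app setup are illustrative assumptions, not taken from these examples:

from flask import Flask, request

app = Flask(__name__)
# Register the view above under a hypothetical /tags endpoint.
app.add_url_rule('/tags', 'getTagsFromText', getTagsFromText)

if __name__ == '__main__':
    app.run()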
Example #2
def get_test_document(filename, token_processor):
    # Load one document, tokenise it, and build its term-frequency map.
    if not token_processor:
        token_processor = TokenProcessor()
    doc = Document(filename)
    doc.load_from_file(filename)
    doc.extract_terms(token_processor)
    doc.generate_frequency_map()
    return doc
Example #3
def main():
    token_processor = TokenProcessor()

    doc = Document(1)
    # The original left this call commented out, so the document was never
    # loaded before term extraction; the "data/test" path is borrowed from
    # Example #5.
    doc.load_from_file("data/test")
    doc.extract_terms(token_processor)
    doc.generate_frequency_map()
    print(doc.get_frequent_terms())
Example #4
def get_train_documents(documents_path, token_processor):
    if not token_processor:
        token_processor = TokenProcessor()
    documents = []
    filenames = glob.glob(documents_path)

    # Bag 20% of the matching files, sampled without replacement.
    # Note: the original sampled from range(len(filenames) - 1), which could
    # never pick the last file; range(len(filenames)) fixes that off-by-one.
    bagWidth = int(0.2 * len(filenames))
    docsInBag = random.sample(range(len(filenames)), bagWidth)

    for i in docsInBag:
        doc = Document(i)
        doc.load_from_file(filenames[i])
        doc.extract_terms(token_processor)
        doc.generate_frequency_map()
        documents.append(doc)
    return documents
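A hypothetical call for context; the "Business" category directory is an illustrative assumption (Example #7 builds the real path from a key read out of a pickled dictionary):

train_docs = get_train_documents(
    "inputdocuments/Tagged_Documents_2018-05-25_215336/Business/*.txt", None)
print(len(train_docs))  # roughly 20% of the matching files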
Example #5
def main():
    tokenizer = TokenProcessor()

    doc = Document(1)
    doc.load_from_file("data/test")
    doc.extract_terms(tokenizer)
    doc.generate_frequency_map()
    print(doc)

    doc1 = Document(2)
    doc1.load_from_file("data/test2.txt")
    doc1.extract_terms(tokenizer)
    doc1.generate_frequency_map()
    print(doc1)

    # Score the term "i": its frequency within doc, then its inverse
    # document frequency across both documents.
    tfidf = TFIDF()
    tf = tfidf.calculate_term_frequency(doc, "i")
    print(tf)
    idf = tfidf.calculate_inverse_document_frequency([doc, doc1], "i")
    print(idf)
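For reference, a self-contained sketch of the standard TF-IDF formulas that a class like TFIDF presumably implements; the project's exact normalisation is not shown in these examples, so treat this as an assumption:

import math

def term_frequency(term, terms):
    # Raw count of the term, normalised by document length.
    return terms.count(term) / len(terms)

def inverse_document_frequency(term, documents):
    # log(N / df): down-weight terms that appear in many documents.
    containing = sum(1 for doc in documents if term in doc)
    return math.log(len(documents) / containing) if containing else 0.0

docs = [["i", "like", "tea"], ["tea", "is", "hot"]]
print(term_frequency("tea", docs[0]))           # 0.333...
print(inverse_document_frequency("tea", docs))  # log(2/2) = 0.0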
Example #6
def main():
    # Build a tag dictionary from the 25 training articles.
    tagDict = {}
    for i in range(25):
        tagDict = test_article(tagDict, i)

    testfile = "Similarity/workfile.txt"
    er = EventRegistry(apiKey='ab40eb06-3900-4689-a369-b4098f4e49ef')

    with open(testfile, 'r') as file1:
        text = file1.read()

    # Annotate the test text with the EventRegistry analytics service.
    analytics = Analytics(er)
    ann = analytics.annotate(text)
    parsed_json = json.loads(json.dumps(ann))
    annotations = [annotation[u'title'] for annotation in parsed_json[u'annotations']]

    # Merge the TF-IDF tags with the annotations and count occurrences.
    my_list = list(itertools.chain(*tagDict.values())) + annotations
    print(dict(pd.value_counts(my_list)))
Example #7
def test_article(tagDict, i):
    token_processor = TokenProcessor()
    testfile = "Similarity/workfile.txt"

    # Load the pre-computed category dictionary and take its first key.
    # (pickle and next(iter(...)) replace the Python 2 cPickle / keys()[0].)
    with open(r"Similarity/Result.pickle", "rb") as input_file:
        typeDict = pickle.load(input_file)
    tipe = next(iter(typeDict))

    # Train on a 20% bag of the documents tagged with that category.
    documents = datautils.get_train_documents(
        "inputdocuments/Tagged_Documents_2018-05-25_215336/" + tipe + "/*.txt",
        token_processor)
    doc = datautils.get_test_document(testfile, token_processor)

    tagger = Tagger()
    for document in documents:
        tagger.add_document(document)

    # Weight the test document's terms by TF-IDF and turn them into tags.
    weighted_terms = tagger.get_terms_weighted_by_tfidf(doc)
    tags = tagger.get_tags_using_weighted_terms(weighted_terms)
    tagDict[i] = tags
    return tagDict
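A guess at what Tagger.get_tags_using_weighted_terms might do: keep the highest-weighted terms as tags. The dict input shape and the cutoff of 5 are assumptions for illustration, not the project's actual implementation:

def get_tags_using_weighted_terms(weighted_terms, n=5):
    # Sort terms by descending weight and keep the top n as tags.
    ranked = sorted(weighted_terms.items(), key=lambda kv: kv[1], reverse=True)
    return [term for term, _ in ranked[:n]]

print(get_tags_using_weighted_terms({"nasa": 0.9, "launch": 0.7, "the": 0.1}, n=2))
# ['nasa', 'launch']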