def getTagsFromText():
    input_text = request.args.get('text')
    if input_text is None:
        return "Please give a 'text' parameter to analyse."

    data = {}
    result_category = similarityTopic(input_text)

    # Run the tagger 25 times (each run trains on a random bag of documents)
    # and collect the generated tags.
    tagDict = {}
    for i in range(25):
        tagDict = test_article(tagDict, i)

    # testfile = "Similarity/workfile.txt"
    token_processor = TokenProcessor()
    er = EventRegistry(apiKey='ab40eb06-3900-4689-a369-b4098f4e49ef')
    # doc = datautils.get_test_document(testfile, token_processor)
    # file1 = open(testfile, 'r')
    text = input_text

    # Annotate the input text via EventRegistry and keep the annotation titles.
    analytics = Analytics(er)
    ann = analytics.annotate(text)
    annotations = []
    parsed_json = json.loads(json.dumps(ann))
    for annotation in parsed_json[u'annotations']:
        annotations.append(annotation[u'title'])

    # Merge the generated tags with the annotation titles and count occurrences;
    # cast the numpy counts to plain int so json.dumps can serialise them.
    my_list = list(itertools.chain(*tagDict.values())) + annotations
    data['text_category'] = str(result_category)
    data['encoding'] = 'utf-8'
    data['tags'] = {tag: int(count) for tag, count in pd.value_counts(my_list).items()}
    return json.dumps(data)
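# Usage sketch (assumption, not taken from the repository): `request.args.get`
# suggests this handler runs under Flask. The app object, route path, and port
# below are illustrative names showing how getTagsFromText() could be exposed.
from flask import Flask, request

app = Flask(__name__)

# e.g. GET /getTagsFromText?text=Some+article+text
app.add_url_rule('/getTagsFromText', 'getTagsFromText', getTagsFromText)

if __name__ == '__main__':
    app.run(port=5000)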
def get_test_document(filename, token_processor):
    if not token_processor:
        token_processor = TokenProcessor()
    doc = Document(filename)
    doc.load_from_file(filename)
    doc.extract_terms(token_processor)
    doc.generate_frequency_map()
    return doc
def main():
    token_processor = TokenProcessor()
    doc = Document(1)
    # doc.load_from_file()
    doc.extract_terms(token_processor)
    doc.generate_frequency_map()
    print(doc.get_frequent_terms())
def get_train_documents(documents_path, token_processor):
    if not token_processor:
        token_processor = TokenProcessor()
    documents = []
    filenames = glob.glob(documents_path)

    # Bagging: randomly sample 20% of the matched files as the training set.
    bagWidth = int(0.2 * len(filenames))
    docsInBag = random.sample(range(len(filenames)), bagWidth)
    for i in docsInBag:
        # print(i)
        doc = Document(i)
        doc.load_from_file(filenames[i])
        doc.extract_terms(token_processor)
        doc.generate_frequency_map()
        documents.append(doc)
    return documents
def main():
    tokenizer = TokenProcessor()

    doc = Document(1)
    doc.load_from_file("data/test")
    doc.extract_terms(tokenizer)
    doc.generate_frequency_map()
    print(doc)

    doc1 = Document(2)
    doc1.load_from_file("data/test2.txt")
    doc1.extract_terms(tokenizer)
    doc1.generate_frequency_map()
    print(doc1)

    tfidf = TFIDF()
    tf = tfidf.calculate_term_frequency(doc, "i")
    print(tf)
    idf = tfidf.calculate_inverse_document_frequency([doc, doc1], "i")
    print(idf)
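# Reference sketch (assumption): the conventional tf and idf definitions that the
# TFIDF class above is presumably computing; its actual internals are not shown
# here, so treat these helper functions as illustrative only.
import math

def term_frequency(doc_terms, term):
    # tf(t, d) = occurrences of t in d divided by the total number of terms in d
    return doc_terms.count(term) / float(len(doc_terms))

def inverse_document_frequency(documents_terms, term):
    # idf(t, D) = log(N / number of documents in D that contain t)
    containing = sum(1 for terms in documents_terms if term in terms)
    return math.log(len(documents_terms) / float(containing)) if containing else 0.0

# Example: term_frequency(["i", "like", "i"], "i") == 2/3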
def main():
    # Run the tagger 25 times (each run trains on a random bag of documents)
    # and collect the generated tags.
    tagDict = {}
    for i in range(25):
        tagDict = test_article(tagDict, i)

    testfile = "Similarity/workfile.txt"
    token_processor = TokenProcessor()
    er = EventRegistry(apiKey='ab40eb06-3900-4689-a369-b4098f4e49ef')
    # doc = datautils.get_test_document(testfile, token_processor)
    with open(testfile, 'r') as file1:
        text = file1.read()

    # Annotate the test document via EventRegistry and keep the annotation titles.
    analytics = Analytics(er)
    ann = analytics.annotate(text)
    annotations = []
    parsed_json = json.loads(json.dumps(ann))
    for annotation in parsed_json[u'annotations']:
        annotations.append(annotation[u'title'])
    # print(annotations)

    # Merge the generated tags with the annotation titles and count occurrences.
    my_list = list(itertools.chain(*tagDict.values())) + annotations
    print(dict(pd.value_counts(my_list)))
def test_article(tagDict, i):
    token_processor = TokenProcessor()
    testfile = "Similarity/workfile.txt"

    # Load the pre-computed similarity result and use its top category.
    with open(r"Similarity/Result.pickle", "rb") as input_file:
        typeDict = cPickle.load(input_file)
    tipe = list(typeDict.keys())[0]
    # print(tipe)

    # Train the tagger on a random bag of documents from that category,
    # then tag the test file.
    documents = datautils.get_train_documents(
        "inputdocuments/Tagged_Documents_2018-05-25_215336/" + tipe + "/*.txt",
        token_processor)
    doc = datautils.get_test_document(testfile, token_processor)

    tagger = Tagger()
    for document in documents:
        tagger.add_document(document)
    weighted_terms = tagger.get_terms_weighted_by_tfidf(doc)
    tags = tagger.get_tags_using_weighted_terms(weighted_terms)
    # print("Tags generated for the document are:\n{}".format(tags))

    tagDict[i] = tags
    return tagDict