class TestClientTextApi(unittest.TestCase):
    """Integration tests for the cortical.io /text REST endpoints (TextApi).

    Requires a live client configured in ``testConfiguration`` plus the
    module-level ``inputText`` / ``bulkText`` fixtures.  Written for
    Python 2 (relies on the ``unicode`` builtin).
    """

    def setUp(self):
        # Fresh TextApi wrapper per test, sharing the one configured client.
        self.api = TextApi(testConfiguration.client)

    def testText(self):
        """A single input text yields exactly one reasonably dense fingerprint."""
        fpList = self.api.getRepresentationForText(
            testConfiguration.RETINA_NAME, inputText)
        fp = fpList[0]
        self.assertEqual(len(fpList), 1)
        # assertIsNotNone replaces the weaker assertNotEqual(fp, None).
        self.assertIsNotNone(fp)
        self.assertGreater(len(fp.positions), 500)

    def testKeywords(self):
        """Keyword extraction returns several unicode terms."""
        termList = self.api.getKeywordsForText(
            testConfiguration.RETINA_NAME, inputText)
        self.assertGreater(len(termList), 2)
        self.assertTrue(isinstance(termList[0], unicode))

    def testTokenize(self):
        """Tokenization returns comma-joined unicode token strings."""
        tokens = self.api.getTokensForText(
            testConfiguration.RETINA_NAME, inputText)
        # assertNotEqual/assertEqual replace the deprecated *Equals aliases.
        self.assertNotEqual(len(tokens), 0)
        self.assertTrue(isinstance(tokens[0], unicode))
        self.assertEqual(tokens[0].split(',')[0], "george")

    def testSlices(self):
        """Slicing returns the requested page of slices with fingerprints."""
        texts = self.api.getSlicesForText(
            testConfiguration.RETINA_NAME, inputText, True, 0, 2)
        self.assertEqual(len(texts), 2)
        self.assertEqual(texts[0].text.split(' ')[0], "George")
        self.assertGreater(len(texts[0].fingerprint.positions), 100)

    def testBulk(self):
        """Bulk fingerprinting returns one dense fingerprint per input text."""
        fingerprints = self.api.getRepresentationsForBulkText(
            testConfiguration.RETINA_NAME, bulkText, 1.0)
        self.assertEqual(len(fingerprints), 6)
        for fp in fingerprints:
            self.assertGreater(len(fp.positions), 100)

    def testLanguageDetection(self):
        """Language detection exposes language name, wiki URL and ISO tag."""
        self.assertEqual(
            self.api.getLanguage("I have a dream!").language, "English")
        self.assertEqual(
            self.api.getLanguage("Ich bin ein").wiki_url,
            "http://en.wikipedia.org/wiki/German_language")
        self.assertEqual(
            self.api.getLanguage("Der var så dejligt ude på landet.").iso_tag,
            "da")
class TestClientTextApi(unittest.TestCase):
    """Tests the TextApi client against the live cortical.io /text endpoints.

    Needs ``testConfiguration`` (client + RETINA_NAME) and the module-level
    ``inputText`` / ``bulkText`` fixtures.  Python 2 code (``unicode``).
    """

    def setUp(self):
        # One fresh API wrapper per test case.
        self.api = TextApi(testConfiguration.client)

    def testText(self):
        """Exactly one fingerprint comes back, with a dense position list."""
        fpList = self.api.getRepresentationForText(
            testConfiguration.RETINA_NAME, inputText)
        fp = fpList[0]
        self.assertEqual(len(fpList), 1)
        # assertIsNotNone replaces the weaker assertNotEqual(fp, None).
        self.assertIsNotNone(fp)
        self.assertGreater(len(fp.positions), 500)

    def testKeywords(self):
        """At least three unicode keywords are extracted."""
        termList = self.api.getKeywordsForText(
            testConfiguration.RETINA_NAME, inputText)
        self.assertGreater(len(termList), 2)
        self.assertTrue(isinstance(termList[0], unicode))

    def testTokenize(self):
        """Tokens are non-empty, unicode, and comma-joined per sentence."""
        tokens = self.api.getTokensForText(
            testConfiguration.RETINA_NAME, inputText)
        # Deprecated assertNotEquals/assertEquals aliases replaced.
        self.assertNotEqual(len(tokens), 0)
        self.assertTrue(isinstance(tokens[0], unicode))
        self.assertEqual(tokens[0].split(',')[0], "george")

    def testSlices(self):
        """Requesting two slices returns two, each with text + fingerprint."""
        texts = self.api.getSlicesForText(
            testConfiguration.RETINA_NAME, inputText, True, 0, 2)
        self.assertEqual(len(texts), 2)
        self.assertEqual(texts[0].text.split(' ')[0], "George")
        self.assertGreater(len(texts[0].fingerprint.positions), 100)

    def testBulk(self):
        """Six input texts yield six dense fingerprints."""
        fingerprints = self.api.getRepresentationsForBulkText(
            testConfiguration.RETINA_NAME, bulkText, 1.0)
        self.assertEqual(len(fingerprints), 6)
        for fp in fingerprints:
            self.assertGreater(len(fp.positions), 100)

    def testLanguageDetection(self):
        """Detected language exposes name, wiki URL and ISO tag fields."""
        self.assertEqual(
            self.api.getLanguage("I have a dream!").language, "English")
        self.assertEqual(
            self.api.getLanguage("Ich bin ein").wiki_url,
            "http://en.wikipedia.org/wiki/German_language")
        self.assertEqual(
            self.api.getLanguage("Der var så dejligt ude på landet.").iso_tag,
            "da")
body = myfile.read().replace('\n', '') #################################################### ######### Code for fingerprint (vector) ############ #################################################### # Chose either en_synonymous or en_associative retina text = TextApi(client).getRepresentationForText("en_synonymous", body) print text[0].positions #################################################### ############# Code for keywords list ############### #################################################### # Chose either en_synonymous or en_associative retina terms = TextApi(client).getKeywordsForText("en_synonymous", body) print terms ##################################################### ########## Code for fingerprint (image) ############# ##################################################### body = '{"text":"%s"}' % body # Chose either en_synonymous or en_associative (default) retina, image scalar (default: 2), square or circle (default) image, encoding type, and sparsity terms = ImageApi(client).getImageForExpression("en_synonymous", body, 2, "square","base64/png", '1.0') # Chose image name image_name = file_name.replace(".txt","") fh = open(image_name + "_fpImage.png", "wb") fh.write(terms.decode('base64'))
def setUp(self):
    """Bind a fresh TextApi to the shared test client before each test."""
    self.api = TextApi(testConfiguration.client)
def summarize(text, len_sentences = 5, retina = retina, apiKey = apiKey,
              min_length = min_length, similarity_measure = similarity_measure):
    """Extractive summary of *text* via PageRank over sentence similarity.

    Splits *text* on '.', drops sentences shorter than *min_length*,
    fingerprints them in one bulk call to the cortical.io REST API, builds a
    complete weighted graph of pairwise cosine similarities, ranks the
    sentences with PageRank, and returns the top *len_sentences* sentences
    ordered by a depth-first walk of the minimum spanning tree over the
    selected subgraph.

    NOTE: the defaults are bound to module-level globals (retina, apiKey,
    min_length, similarity_measure) at definition time, and
    *similarity_measure* is currently unused — kept for interface
    compatibility.

    Returns a list of sentence strings.
    """
    sentences = [sentence for sentence in text.split(".")
                 if len(sentence) >= min_length]

    # One bulk request: one fingerprint per sentence, same order as input.
    request_body = json.dumps([{"text": s} for s in sentences])
    client = ApiClient(apiKey=apiKey, apiServer="http://api.cortical.io/rest")
    api = TextApi(client)
    fingerprints = api.getRepresentationsForBulkText(retina, request_body)

    # Complete graph: node = sentence position, edge weight = cosine
    # similarity of the two sentence fingerprints.
    graph = nx.Graph()
    pos_fingerprints = list(zip(range(len(sentences)), fingerprints))
    for (pos1, fp1), (pos2, fp2) in itertools.combinations(pos_fingerprints, 2):
        graph.add_weighted_edges_from(
            [(pos1, pos2, cosine_similarity(fp1.positions, fp2.positions))])

    # PageRank the similarity graph; the highest-ranked sentence positions
    # make the summary (https://en.wikipedia.org/wiki/PageRank).
    page_rank = nx.pagerank(graph)
    sorted_rank = sorted(((weight, pos) for pos, weight in page_rank.items()),
                         reverse=True)
    summaries = [pos for weight, pos in sorted_rank[:len_sentences]]

    # Subgraph over the selected sentences, keeping original edge weights.
    # BUG FIX: the original read graph[i][j]['weight'] with i, j being
    # indices *into the summaries list*; the graph is keyed by sentence
    # positions, so the lookup must use summaries[i] / summaries[j].
    summary_graph = nx.Graph()
    for i in range(len(summaries)):
        for j in range(i + 1, len(summaries)):
            u, v = summaries[i], summaries[j]
            summary_graph.add_weighted_edges_from([(u, v, graph[u][v]['weight'])])

    # Order the chosen sentences by walking the MST depth-first, emitting
    # each sentence the first time its node is touched.
    gr = nx.minimum_spanning_tree(summary_graph)
    visited = {}
    summary = []
    for edge in nx.dfs_edges(gr):
        for node in edge:
            if node not in visited:
                summary.append(sentences[node])
                visited[node] = True
    return summary
def gettokens(txt):
    """Tokenize *txt* with the retina configured in ``config``."""
    return api.getTokensForText(config.RETINA_NAME, txt)


def getslices(txt):
    """Split *txt* into semantic slices with the configured retina."""
    return api.getSlicesForText(config.RETINA_NAME, txt)


if __name__ == '__main__':
    client = config.client
    api = TextApi(client)
    scpr = scraper()
    kwords = []
    tokens = []

    # Target company page text — not consumed below; presumably used by a
    # later matching step. TODO confirm before removing.
    targettxt = url2txt(scpr, TARGET_COMPANY_URL)
    candidatetxts = PDF2txt("resume.pdf")

    # Iterate the texts directly instead of indexing via range(len(...)).
    for candidate in candidatetxts:
        kwords += getkw(candidate)
        tokens += gettokens(candidate)
    print("Keywords extracted, Tokens ready")

    if testing_slate:
        print([r.encode('utf-8') for r in kwords])
        print([r.encode('utf-8') for r in tokens])
# Code to get fingerprints from a string # Body = "Semantic fingerprints are cool." # Code to get fingerprints from a .txt file put filename file_name = "15_UTX.txt" with open('Company_Descriptions/' + file_name, "r") as myfile: body = myfile.read().replace('\n', '') #################################################### ######### Code for fingerprint (vector) ############ #################################################### # Chose either en_synonymous or en_associative retina text = TextApi(client).getRepresentationForText("en_synonymous", body) print text[0].positions #################################################### ############# Code for keywords list ############### #################################################### # Chose either en_synonymous or en_associative retina terms = TextApi(client).getKeywordsForText("en_synonymous", body) print terms ##################################################### ########## Code for fingerprint (image) ############# ##################################################### body = '{"text":"%s"}' % body