def generateTestFeatures(infile):
    """Pick highly ranked sentences from a document and label them by hand.

    The file is loaded as a ``Document`` and every sentence returned by
    ``doc.all_sentences()`` is ranked with ``TextRank``.  The ranking is
    walked in the order stored in ``ranker.scores`` until 7 sentences
    accepted by ``validSentence`` have been found, or until the ranking
    runs out (previously an exhausted ranking raised ``IndexError``).

    For every accepted sentence an entry is stored in the module-level
    ``test_data`` mapping under the key ``<fcode>-<sentence index>`` with:

    * ``'sentence'``   -- the UTF-8 encoded sentence text
    * ``'textrank'``   -- the score stored next to it in ``ranker.scores``
    * ``'contextpre'`` -- ``getContext(doc, idx, -2)``
    * ``'contextpos'`` -- ``getContext(doc, idx, 2)``

    Each entry is then printed and one line is read from stdin; that
    line with ``'1'`` appended is stored as the entry's ``'reallbl'``.

    Args:
        infile: path whose file name has the form
            ``<fcode>-parscit-section.xml``.

    Raises:
        AttributeError: if the file name does not match that pattern
            (``re.match`` returns ``None``).
    """
    doc = Document(infile)

    # File code used as the key prefix, for display and analysis.
    filename = os.path.basename(infile)
    fcode = re.match(r'(.+)-parscit-section\.xml', filename).group(1)

    all_sentences, all_offset = doc.all_sentences()
    ranker = TextRank(all_sentences)
    ranker.rank()

    wanted = 7
    rank_pos = 0
    sent_indices = []
    while len(sent_indices) < wanted:
        try:
            ranked = ranker.scores[rank_pos]
        except IndexError:
            # Fewer acceptable sentences than requested: label what we have
            # instead of crashing before any labelling happens.
            break
        rank_pos += 1

        # Ranker positions are relative to all_sentences; shift by the
        # offset to index into the document.
        idx = ranked[0] + all_offset
        if not validSentence(doc[idx]):
            continue

        sent_indices.append(idx)
        # For display and analysis.
        test_data[fcode + '-' + str(idx)] = {
            'sentence': doc[idx].sentence.encode('utf-8'),
            'textrank': ranked[1],
            'contextpre': getContext(doc, idx, -2),
            'contextpos': getContext(doc, idx, 2),
        }

    for sent_idx in sent_indices:
        key = fcode + '-' + str(sent_idx)
        entry = test_data[key]
        # Single-argument print(...) behaves the same as the Python 2
        # print statement and also parses under Python 3.
        print(key)
        print(entry['contextpre'])
        print("----Main sentence Start----")
        print(entry['sentence'])
        print("----Main sentence End----")
        print(entry['contextpos'])
        # The typed label is stored with a trailing '1' appended.
        entry['reallbl'] = raw_input() + '1'
def generateTestFeatures(infile):
    """Pick highly ranked sentences from a document and label them by hand.

    The file is loaded as a ``Document`` and every sentence returned by
    ``doc.all_sentences()`` is ranked with ``TextRank``.  The ranking is
    walked in the order stored in ``ranker.scores`` until 7 sentences
    accepted by ``validSentence`` have been found, or until the ranking
    runs out (previously an exhausted ranking raised ``IndexError``).

    For every accepted sentence an entry is stored in the module-level
    ``test_data`` mapping under the key ``<fcode>-<sentence index>`` with:

    * ``'sentence'``   -- the UTF-8 encoded sentence text
    * ``'textrank'``   -- the score stored next to it in ``ranker.scores``
    * ``'contextpre'`` -- ``getContext(doc, idx, -2)``
    * ``'contextpos'`` -- ``getContext(doc, idx, 2)``

    Each entry is then printed and one line is read from stdin; that
    line with ``'1'`` appended is stored as the entry's ``'reallbl'``.

    Args:
        infile: path whose file name has the form
            ``<fcode>-parscit-section.xml``.

    Raises:
        AttributeError: if the file name does not match that pattern
            (``re.match`` returns ``None``).
    """
    doc = Document(infile)

    # File code used as the key prefix, for display and analysis.
    filename = os.path.basename(infile)
    fcode = re.match(r'(.+)-parscit-section\.xml', filename).group(1)

    all_sentences, all_offset = doc.all_sentences()
    ranker = TextRank(all_sentences)
    ranker.rank()

    wanted = 7
    rank_pos = 0
    sent_indices = []
    while len(sent_indices) < wanted:
        try:
            ranked = ranker.scores[rank_pos]
        except IndexError:
            # Fewer acceptable sentences than requested: label what we have
            # instead of crashing before any labelling happens.
            break
        rank_pos += 1

        # Ranker positions are relative to all_sentences; shift by the
        # offset to index into the document.
        idx = ranked[0] + all_offset
        if not validSentence(doc[idx]):
            continue

        sent_indices.append(idx)
        # For display and analysis.
        test_data[fcode + '-' + str(idx)] = {
            'sentence': doc[idx].sentence.encode('utf-8'),
            'textrank': ranked[1],
            'contextpre': getContext(doc, idx, -2),
            'contextpos': getContext(doc, idx, 2),
        }

    for sent_idx in sent_indices:
        key = fcode + '-' + str(sent_idx)
        entry = test_data[key]
        # Single-argument print(...) behaves the same as the Python 2
        # print statement and also parses under Python 3.
        print(key)
        print(entry['contextpre'])
        print("----Main sentence Start----")
        print(entry['sentence'])
        print("----Main sentence End----")
        print(entry['contextpos'])
        # The typed label is stored with a trailing '1' appended.
        entry['reallbl'] = raw_input() + '1'
def classifyDoc(document):
    """Build a summary from ranked sentences filtered by an SVM classifier.

    The document is loaded as a ``Document``.  The text of each section is
    concatenated and handed to ``Ranker``; all sentences are ranked with
    ``TextRank``.  Candidates are then taken in the order stored in
    ``ranker.scores``, skipping any sentence rejected by ``validSentence``
    or located in the ``'abstract'`` section.  For every remaining
    candidate a dependency parse is requested, one feature line is
    written to ``features_svm.txt`` and the external ``svm_classify``
    binary is run on it; the sentence is kept when the value read from
    the classifier's output file is positive.

    The loop stops as soon as one of these holds:

    * 10 sentences have been kept,
    * the kept sentences total more than 130 space-separated tokens,
    * 20 candidates have been classified,
    * the ranking is exhausted (previously this raised ``IndexError``
      and lost the summary built so far).

    The kept sentences, joined by newlines, are written to
    ``svm_summary.txt`` under ``DIR['DATA']`` and printed.

    Args:
        document: value passed straight to ``Document(...)``.
    """
    featurefile = DIR['DATA'] + 'features_svm.txt'
    classify = DIR['BASE'] + "lib/svm-light/svm_classify"
    model = DIR['DATA'] + "sec-tfidf-model.txt"
    outfile = DIR['DATA'] + "svm-out-sent.txt"

    # NOTE(review): this connection is never closed in this function --
    # confirm whether getConnection() hands out a shared connection.
    client_socket = getConnection()
    doc = Document(document)

    # Clubbing sentences in sections and passing to the ranker.
    sections = []
    for _sec, block in doc.document.items():
        sections.append(''.join(str(block[key])
                                for key in sorted(block.keys())))
    sec_ranker = Ranker(sections)

    sents, offset = doc.all_sentences()
    ranker = TextRank(sents)
    ranker.rank()

    looper = 20      # maximum number of candidates sent to the classifier
    num = 10         # maximum number of sentences in the summary
    x = 0            # position in ranker.scores
    summary = []
    sent_idx = [0]   # one-element list reused as input to sent2Section
    sum_len = 0      # running count of space-separated tokens kept
    while num > 0:
        try:
            ranked = ranker.scores[x]
        except IndexError:
            # Skipped sentences do not count against `looper`, so the
            # ranking can run out first; stop with what has been collected.
            break
        x += 1

        idx = ranked[0] + offset
        if (not validSentence(doc[idx])
                or doc.get_section_name(idx) == 'abstract'):
            continue
        sent_idx[0] = idx
        sentence = doc[idx].sentence.encode('utf-8')

        # Dependency parse.
        tree = parseTrees(getDepParse(client_socket, sentence))

        # The sent_idx needs to be converted to reflect the corresponding
        # section index.
        sec_idx = sent2Section(doc, sent_idx)

        # The feature file is recreated for every candidate so that it
        # holds exactly one line when the classifier runs.
        deleteFiles([featurefile])
        feature_string = "+1" + processTree(tree, sec_ranker, sec_idx[0],
                                            False)
        writeToFile(featurefile, feature_string + '\n', 'a')

        deleteFiles([outfile])
        subprocess.call([classify, featurefile, model, outfile])
        with open(outfile, 'r') as ofile:
            sent_val = float(ofile.read().strip())

        if sent_val > 0:
            summary.append(sentence)
            num -= 1
            sum_len += len(sentence.split(' '))
        if sum_len > 130:
            break

        looper -= 1
        if looper == 0:
            # Single-argument print(...) matches the Python 2 print
            # statement and also parses under Python 3.
            print("Looper Done")
            break

    summary_text = '\n'.join(summary)
    writeToFile(DIR['DATA'] + "svm_summary.txt", summary_text, 'w')
    print(summary_text)
def classifyDoc(document):
    """Build a summary from ranked sentences filtered by an SVM classifier.

    The document is loaded as a ``Document``.  The text of each section is
    concatenated and handed to ``Ranker``; all sentences are ranked with
    ``TextRank``.  Candidates are then taken in the order stored in
    ``ranker.scores``, skipping any sentence rejected by ``validSentence``
    or located in the ``'abstract'`` section.  For every remaining
    candidate a dependency parse is requested, one feature line is
    written to ``features_svm.txt`` and the external ``svm_classify``
    binary is run on it; the sentence is kept when the value read from
    the classifier's output file is positive.

    The loop stops as soon as one of these holds:

    * 10 sentences have been kept,
    * the kept sentences total more than 130 space-separated tokens,
    * 20 candidates have been classified,
    * the ranking is exhausted (previously this raised ``IndexError``
      and lost the summary built so far).

    The kept sentences, joined by newlines, are written to
    ``svm_summary.txt`` under ``DIR['DATA']`` and printed.

    Args:
        document: value passed straight to ``Document(...)``.
    """
    featurefile = DIR['DATA'] + 'features_svm.txt'
    classify = DIR['BASE'] + "lib/svm-light/svm_classify"
    model = DIR['DATA'] + "sec-tfidf-model.txt"
    outfile = DIR['DATA'] + "svm-out-sent.txt"

    # NOTE(review): this connection is never closed in this function --
    # confirm whether getConnection() hands out a shared connection.
    client_socket = getConnection()
    doc = Document(document)

    # Clubbing sentences in sections and passing to the ranker.
    sections = []
    for _sec, block in doc.document.items():
        sections.append(''.join(str(block[key])
                                for key in sorted(block.keys())))
    sec_ranker = Ranker(sections)

    sents, offset = doc.all_sentences()
    ranker = TextRank(sents)
    ranker.rank()

    looper = 20      # maximum number of candidates sent to the classifier
    num = 10         # maximum number of sentences in the summary
    x = 0            # position in ranker.scores
    summary = []
    sent_idx = [0]   # one-element list reused as input to sent2Section
    sum_len = 0      # running count of space-separated tokens kept
    while num > 0:
        try:
            ranked = ranker.scores[x]
        except IndexError:
            # Skipped sentences do not count against `looper`, so the
            # ranking can run out first; stop with what has been collected.
            break
        x += 1

        idx = ranked[0] + offset
        if (not validSentence(doc[idx])
                or doc.get_section_name(idx) == 'abstract'):
            continue
        sent_idx[0] = idx
        sentence = doc[idx].sentence.encode('utf-8')

        # Dependency parse.
        tree = parseTrees(getDepParse(client_socket, sentence))

        # The sent_idx needs to be converted to reflect the corresponding
        # section index.
        sec_idx = sent2Section(doc, sent_idx)

        # The feature file is recreated for every candidate so that it
        # holds exactly one line when the classifier runs.
        deleteFiles([featurefile])
        feature_string = "+1" + processTree(tree, sec_ranker, sec_idx[0],
                                            False)
        writeToFile(featurefile, feature_string + '\n', 'a')

        deleteFiles([outfile])
        subprocess.call([classify, featurefile, model, outfile])
        with open(outfile, 'r') as ofile:
            sent_val = float(ofile.read().strip())

        if sent_val > 0:
            summary.append(sentence)
            num -= 1
            sum_len += len(sentence.split(' '))
        if sum_len > 130:
            break

        looper -= 1
        if looper == 0:
            # Single-argument print(...) matches the Python 2 print
            # statement and also parses under Python 3.
            print("Looper Done")
            break

    summary_text = '\n'.join(summary)
    writeToFile(DIR['DATA'] + "svm_summary.txt", summary_text, 'w')
    print(summary_text)