def testAddDocNormalCase(self):
    print "SuperDocGeneratorTest: testing SuperDocGenerator.add_doc normal case..."
    test_meta = SupremeCourtOpinionMetadata()
    test_meta.case_num = "No. 99"
    test_doc = Document(test_meta, OPINION_TEXT, TEST_PICKLE_PATH)
    self.assertEqual(len(self.test_generator.doc_list), 5)
    self.test_generator.add_doc(test_doc)
    self.assertEqual(len(self.test_generator.doc_list), 6)
    self.assertEqual(self.test_generator.doc_list[5], test_doc)
def _parse_html_file(path_info):
    '''Open the HTML file with the given path, then parse the file.'''
    file, file_dir, url = path_info
    with open(file_dir + file, 'r') as html:
        try:
            document = Document(url, file_dir, html).export()
            return document
        except Exception as ex:
            message = 'Problem parsing file ' + file
            log_unsuccessful('parse')(message=message, exception=ex)
def generateTrainFeatures(client_socket, infile, featurefile):
    #------------------------------------------------
    doc = Document(infile)
    all_sentences, all_offset = doc.all_sentences()
    #------------------------------------------------
    # Positive sentences
    pos_sents, offset = doc.section_sentences('abstract')
    sent_indices = range(offset, offset + len(pos_sents))
    #-----------------------------------------
    # Sectional Ranker
    sections = []
    for sec, block in doc.document.items():
        sentences = ''
        for key in sorted(block.keys()):
            sentences += (str(block[key]))
        sections.append(sentences)
    sec_ranker = Ranker(sections)
    sec_indices = sent2Section(doc, sent_indices)
    #-----------------------------------------
    # Count ranker
    #count_ranker = Ranker(all_sentences, tfidf=False)
    #-----------------------------------------
    for sentence, sent_idx, sec_idx in zip(pos_sents, sent_indices, sec_indices):
        feature_string = '+1'
        tree = parseTrees(getDepParse(client_socket, sentence))
        feature_string += processTree(tree, sec_ranker, sec_idx)
        writeToFile(featurefile, feature_string + '\n', 'a')
    #------------------------------------------------
    # Negative sentences
    neg_ranker = TextRank(all_sentences)
    neg_ranker.rank()
    num = 5
    x = -1
    neg_sents = []
    sent_indices = []
    while num > 0:
        idx = neg_ranker.scores[x][0] + all_offset
        x -= 1
        if not validSentence(doc[idx]):
            continue
        else:
            sent_indices.append(idx)
            neg_sents.append(doc[idx].sentence.encode('utf-8'))
            num -= 1
    sec_indices = sent2Section(doc, sent_indices)
    #------------------------------------------------
    for sentence, sent_idx, sec_idx in zip(neg_sents, sent_indices, sec_indices):
        feature_string = '-1'
        tree = parseTrees(getDepParse(client_socket, sentence))
        feature_string += processTree(tree, sec_ranker, sec_idx)
        writeToFile(featurefile, feature_string + '\n', 'a')
    #------------------------------------------------
    print "File processed to create feature vectors for training."
def new(self, domain, name):
    '''
    Create a new document

    :param domain: Domain for the document
    :param name: Unique name for the document (in the domain)
    :return: Document
    '''
    doc_id = self.__engine.create_new_doc(domain, name)
    doc = Document(self.__engine, doc_id)
    return doc
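For orientation, a minimal usage sketch of the factory method above; the DocumentStore class name and the engine argument are placeholders for whatever object actually exposes new() in the original project.

# Hypothetical usage; DocumentStore and engine are placeholder names, not from the snippet.
store = DocumentStore(engine)
doc = store.new("reports", "2021-annual")   # a Document in the "reports" domain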
def getDocument(self, text):
    other = {}
    modeA = False
    modeW = False
    info = ""
    identifier = ""
    author = ""
    texte = ""
    st = text.split("\n")
    s = ""
    for s in st:
        if (s.startswith(".I")):
            identifier = s[3:]
            continue
        if (s.startswith(".")):
            if (modeW):
                texte = info
                info = ""
                modeW = False
            if (modeA):
                author = info
                info = ""
                modeA = False
        if (s.startswith(".W")):
            modeW = True
            info = s[2:]
            continue
        if (s.startswith(".A")):
            modeA = True
            info = s[2:]
            continue
        if ((modeW) or (modeA)):
            #print "add "+s
            info += " " + s
    if (modeW):
        texte = info
        info = ""
        modeW = False
    if (modeA):
        author = info
        info = ""
        modeA = False
    other["text"] = texte[4:]
    doc = Document(identifier, texte[2:], other)
    return doc
def parse(self, url, file_type, file_content):
    # Parse the file as an HTML file.
    # Reference from: https://stackoverflow.com/questions
    # 30565404/remove-all-style-scripts-and-html-tags-from-an-html-page
    text = file_content
    title = ''
    if 'html' in file_type:
        # Clean the file. Don't save HTML markup.
        soup = BeautifulSoup(file_content, 'html.parser')
        # Remove all javascript and stylesheet code.
        for script in soup(["script", "style"]):
            script.extract()
        title = soup.title.string  # Get the title of this file.
        # print("The title of this file is: ", title)
        text = soup.body.get_text()  # Get the body of this file.
    lines = (line.strip() for line in text.splitlines())
    # Build a chunk of tokens.
    chunks = []
    for line in lines:
        for phrase in line.split(" "):  # Split on spaces.
            chunks.append(phrase.strip())
    # Drop blank lines.
    text = '\n'.join(chunk for chunk in chunks if chunk)
    # Write to a file.
    self.doc_id += 1
    filename = "Doc#" + str(self.doc_id) + '.txt'
    # Ensure the file will be closed.
    with open(filename, 'w', encoding='utf-8') as f:
        f.write(text)
    # Only assign an id to documents that will actually be parsed.
    document = Document(url, self.doc_id, filename, file_type, self.stop_words)
    document.filter()
    document.stem()
    document.collection()
    # print("There're", len(document.term), "terms in document", filename)
    if 'html' in file_type:
        document.set_title(title)
    # Duplicate detection
    for d in self.docs:
        if self.duplicate_detection(d, document) == 1:
            # print("The content of Doc#{} is an exact duplicate of Doc#{}, so we won't parse Doc#{}."
            #       .format(document.get_id(), d.get_id(), document.get_id()))
            self.url_already_seen = self.url_already_seen.union(
                {str(document.get_url())})
            return False
    self.docs.append(document)
    return True
def spellcheck(file_name):
    print 'spellcheck'
    d = Document(file_name)
    s = SpellcheckAPI()
    tmp = d.get_text()
    if tmp:
        head, tail = os.path.split(file_name)
        out_name = 'samples/output/corrections/%s' % tail
        print(s.spellcheck(tmp))
        f = open(out_name, 'w')
        f.write(s.spellcheck(tmp))
        f.close()
def map(self, line):
    # TODO: call `self.emit(key, value)`
    instance = Document(line)
    min_dist = sys.maxsize
    key = -1
    for cluster in self.clusters:
        dist = MathUtil.compute_distance(map1=cluster.tfidf, map2=instance.tfidf)
        if dist < min_dist:
            key = cluster.uid
            min_dist = dist
    self.emit(key, line)  # instance.__str__()
def map(self, line):
    # Find the cluster assignment by brute force.
    doc = Document(line)
    cluster_uid = None
    sqdist_to_nearest = float('inf')
    for cluster_k in self.clusters:
        sqdist_k = MathUtil.compute_distance(map1=cluster_k.tfidf, map2=doc.tfidf, squared=True)
        if sqdist_k <= sqdist_to_nearest:
            # Track both the nearest distance and the corresponding cluster.
            sqdist_to_nearest = sqdist_k
            cluster_uid = cluster_k.uid
    # Dutifully emit.
    self.emit(key=cluster_uid, value=doc)
    return
def read_clustered_corpus(path):
    result = []
    for directory in os.listdir(path):
        cluster = []
        for file in os.listdir(os.path.join(path, directory)):
            text_file = read_text_file(os.path.join(path, directory, file))
            document = Document(text_file)
            cluster.append(document)
        result.append(cluster)
    return result
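A brief usage sketch for the loader above, assuming the corpus directory contains one sub-directory per cluster with plain-text files inside; the path below is illustrative.

# Hypothetical layout: corpus/<cluster-name>/<doc>.txt, one sub-directory per cluster.
corpus = read_clustered_corpus('corpus')
print(len(corpus), 'clusters,', sum(len(cluster) for cluster in corpus), 'documents')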
def get(self, domain, name):
    '''
    Retrieve a document

    :param domain: Name of the domain to get the document from
    :param name: Name of the document to retrieve (within the domain)
    :return: Document object
    '''
    doc_id = self.__engine.get_document_id(domain, name)
    if doc_id is None:
        raise KeyError("Document doesn't exist: %s %s" % (domain, name))
    return Document(self.__engine, doc_id)
def __init_context(self):
    """ Spidermonkey Context initialization. """
    document = Document(self)
    self.__dict__['__cx'] = self.__dict__['__rt'].new_context(alertlist=[])
    self.__dict__['__sl'] = []
    self.__dict__['__fl'] = [document]
    self.__init_properties(document)
    self.__init_methods()
    self.__finalize_context()
def build(self):
    self.builder = gtk.Builder()
    self.builder.add_from_file(os.path.join(self.datadir, 'ui', 'snippets.ui'))

    handlers_dic = {
        'on_dialog_snippets_response': self.on_dialog_snippets_response,
        'on_dialog_snippets_destroy': self.on_dialog_snippets_destroy,
        'on_button_new_snippet_clicked': self.on_button_new_snippet_clicked,
        'on_button_import_snippets_clicked': self.on_button_import_snippets_clicked,
        'on_button_export_snippets_clicked': self.on_button_export_snippets_clicked,
        'on_button_remove_snippet_clicked': self.on_button_remove_snippet_clicked,
        'on_entry_tab_trigger_focus_out': self.on_entry_tab_trigger_focus_out,
        'on_entry_tab_trigger_changed': self.on_entry_tab_trigger_changed,
        'on_entry_accelerator_focus_out': self.on_entry_accelerator_focus_out,
        'on_entry_accelerator_focus_in': self.on_entry_accelerator_focus_in,
        'on_entry_accelerator_key_press': self.on_entry_accelerator_key_press,
        'on_source_view_snippet_focus_out': self.on_source_view_snippet_focus_out,
        'on_tree_view_snippets_row_expanded': self.on_tree_view_snippets_row_expanded,
        'on_tree_view_snippets_key_press': self.on_tree_view_snippets_key_press}

    self.builder.connect_signals(handlers_dic)
    self.build_tree_view()
    self.build_model()

    image = self['image_remove']
    image.set_from_stock(gtk.STOCK_REMOVE, gtk.ICON_SIZE_SMALL_TOOLBAR)

    source_view = self['source_view_snippet']
    manager = get_language_manager()
    lang = manager.get_language('snippets')

    if lang:
        source_view.get_buffer().set_highlight_syntax(True)
        source_view.get_buffer().set_language(lang)
        self.snippets_doc = Document(None, source_view)

    combo = self['combo_drop_targets']
    combo.set_text_column(0)

    entry = combo.child
    entry.connect('focus-out-event', self.on_entry_drop_targets_focus_out)
    entry.connect('drag-data-received', self.on_entry_drop_targets_drag_data_received)

    lst = entry.drag_dest_get_target_list()
    lst = gtk.target_list_add_uri_targets(entry.drag_dest_get_target_list(), self.TARGET_URI)
    entry.drag_dest_set_target_list(lst)

    self.dlg = self['dialog_snippets']
    if self.default_size:
        self.dlg.set_default_size(*self.default_size)
def newDiagram(self, widget, data=None):
    newDocument = Document(self.tabsPanel.get_current_page() + 1)
    scrollArea = gtk.ScrolledWindow()
    scrollArea.set_policy(gtk.POLICY_ALWAYS, gtk.POLICY_ALWAYS)
    scrollArea.add_with_viewport(newDocument)
    n = self.tabsPanel.append_page(
        scrollArea,
        gtk.Label("Diagram %d" % (self.tabsPanel.get_n_pages() + 1)))
    scrollArea.show_all()
    self.tabsPanel.set_current_page(n)
    self.documentManager.documents.append(newDocument)
def __init__(self):
    '''
    On initialization, the class stores a reference to the DBlink class and two
    Document instances: one for the info of the last query and one for the aliases.
    '''
    self.dblink = DBlink()
    # The lastquery document must live in the user's personal path
    # (HOME on Linux and macOS, Documents on Windows).
    self.lastquery = Document("lastquery.txt")
    # The alias document must live in the class' own path.
    path = os.path.dirname(os.path.abspath(__file__))
    self.alias = KL_Document("alias.txt", path)
    # Default table for the MEDCORDEX user.
    self.default_table = 'MEDCORDEX'
def __init__(self, vectorSize=100, windowSize=5):
    super().__init__()
    self.document = Document()
    if not Path(c.doc2VecModel).exists():
        docs = [TaggedDocument(doc, [i]) for i, doc in enumerate(self.document.docList)]
        print(docs)
        self.model = Doc2Vec(vector_size=vectorSize, window=windowSize, min_count=5,
                             workers=4, epochs=40, alpha=0.025)
        self.model.build_vocab(docs)
        self.model.train(docs, total_examples=self.model.corpus_count, epochs=self.model.epochs)
        self.model.save("./doc2VecModel")
    else:
        self.model = Doc2Vec.load(c.doc2VecModel)
def generate_raw_data(input_file, embed_map, gen_type='gold'):
    print('loading data from %s' % input_file)
    data_reader = jsonlines.open(input_file)
    raw_data = []
    for doc_data in data_reader.iter():
        doc = Document(doc_data, embed_map)
        if gen_type == 'gold':
            raw_data += doc.generate_gold_anaphor_data()
        else:
            raw_data += doc.generate_candidate_anaphor_data()
    print("---> total number of training pairs: %s" % len(raw_data))
    return raw_data
def classifyDoc(document):
    featurefile = DIR['DATA'] + 'features_svm.txt'
    classify = DIR['BASE'] + "lib/svm-light/svm_classify"
    model = DIR['DATA'] + "sec-tfidf-model.txt"
    outfile = DIR['DATA'] + "svm-out-sent.txt"
    #sumlength = 5
    client_socket = getConnection()
    doc = Document(document)
    #-----------------------------------------
    # Clubbing sentences in sections and passing to the ranker
    sections = []
    for sec, block in doc.document.items():
        sentences = ''
        for key in sorted(block.keys()):
            sentences += (str(block[key]))
        sections.append(sentences)
    sec_ranker = Ranker(sections)
    sents, offset = doc.all_sentences()
    ranker = TextRank(sents)
    ranker.rank()
    #-----------------------------------------
    sents, sent_indices = getSecRankedSent(doc)
    #-----------------------------------------
    # The sent_idx needs to be converted to reflect the corresponding
    # section index
    sec_indices = sent2Section(doc, sent_indices)
    summary = []
    classified = []
    sum_len = 0
    for sent, sec_idx in zip(sents, sec_indices):
        #-----------------------------------------
        # dependency parse
        tree = parseTrees(getDepParse(client_socket, sent))
        #-----------------------------------------
        deleteFiles([featurefile])
        feature_string = "+1"
        feature_string += processTree(tree, sec_ranker, sec_idx, False)
        writeToFile(featurefile, feature_string + '\n', 'a')
        deleteFiles([outfile])
        subprocess.call([classify, featurefile, model, outfile])
        with open(outfile, 'r') as ofile:
            sent_val = float(ofile.read().strip())
        classified.append((sent, sent_val))
    for sent, val in sorted(classified, key=itemgetter(1)):
        summary.append(sent)
        sum_len += len(sent.split(' '))
        if sum_len > 130:
            break
    writeToFile(DIR['DATA'] + "svm_summary.txt", '\n'.join(summary), 'w')
    print '\n'.join(summary)
def __init__(self, dataDir, wordToIdMap, wordList):
    self.D = 0  # The number of documents
    # self.clusterNoArray = []
    self.documents = []
    with open(dataDir) as input:
        line = input.readline()
        while line:
            self.D += 1
            obj = json.loads(line)
            text = obj['textCleaned']
            document = Document(text, wordToIdMap, wordList, int(obj['tweetId']))
            self.documents.append(document)
            line = input.readline()
    print("number of documents is ", self.D)
def map(self, line):
    # Key is the cluster id - clusters are stored in self.clusters
    # Value is the line
    dist = float("inf")
    temp_dist = float("inf")
    doc = Document(line)
    key = doc.uid
    for c in self.clusters:
        temp_dist = MathUtil.compute_distance(doc.tfidf, c.tfidf)
        if temp_dist < dist:
            dist = temp_dist
            key = c.uid
    self.emit(str(key), str(doc))
def four_frames_test():
    # make directories
    original_pic_dir = 'tests/four-frames/original-pictures'
    dir_for_bigger_images = 'tests/four-frames' + os.sep + settings.images_ready_for_ocr
    if not os.path.isdir(dir_for_bigger_images):
        os.mkdir(dir_for_bigger_images)
    dir_for_hocr = 'tests/four-frames' + os.sep + settings.hocr_dir
    if not os.path.isdir(dir_for_hocr):
        os.mkdir(dir_for_hocr)
    dir_for_xml = 'tests/four-frames' + os.sep + settings.xml_dir
    if not os.path.isdir(dir_for_xml):
        os.mkdir(dir_for_xml)
    # make the initial run through the images
    for filename in os.listdir(original_pic_dir):
        # resize
        full_path = original_pic_dir + os.sep + filename
        full_path_for_new_image = dir_for_bigger_images + os.sep + filename
        initial_ocr.resize_image(full_path, full_path_for_new_image, redo=True,
                                 part='digital reading')
        # run tesseract
        full_path_for_hocr = dir_for_hocr + os.sep + filename
        initial_ocr.run_tesseract_on_image(full_path_for_new_image, full_path_for_hocr, redo=True)
    # make corrections
    correct_bags = ocr_cleanup.get_correct_bags()
    word_to_doc = ocr_cleanup.make_matching_dictionary(correct_bags)
    ocr_cleanup.cleanup_hocr_files(dir_for_hocr, dir_for_xml, correct_bags, word_to_doc)
    # find differences
    for filename in os.listdir(dir_for_xml):
        full_path = dir_for_xml + os.sep + filename
        doc = Document(full_path)
        lines = [str(l).strip() for l in doc.lines if len(str(l).strip()) > 0]
        filename_with_txt_ending = filename[:-len('png.hocr')] + 'txt'
        path_to_correct_lines_file = ('tests/four-frames' + os.sep +
                                      'limited-correct-output-text' + os.sep +
                                      filename_with_txt_ending)
        with open(path_to_correct_lines_file, 'r') as infile:
            correct_lines = [line.strip() for line in infile]
        if len(lines) != len(correct_lines):
            raise Exception(
                'lines has length {0} but correct_lines has length {1} for {2}'
                .format(len(lines), len(correct_lines), filename))
        for i in range(len(lines)):
            if lines[i] != correct_lines[i]:
                raise Exception(
                    'lines[{0}] has value\n{1}\n but correct_lines[{0}] has value\n{2}\n for {3}'
                    .format(i, lines[i], correct_lines[i], filename))
    print('Four frames test passed')
def get_test_sentences(infile, outfile, backup=False):
    doc = Document(infile)
    sentences, offset = doc.all_sentences()
    ranker = TextRank(sentences)
    ranker.rank()
    num = 7
    x = 0
    samples = ''
    sent_idx = []
    while num > 0:
        idx = ranker.scores[x][0] + offset
        x += 1
        #if not validSentence(doc[idx]):
        #    continue
        #else:
        #    sent_idx.append(idx)
        #    samples += doc[idx].sentence.encode('utf-8') + '\n'
        #    num -= 1
        sent_idx.append(idx)
        samples += doc[idx].sentence.encode('utf-8') + '\n'
        num -= 1
        #---------------------------------------------------
        # Storing the sentence in the dictionary for pickling for display
        infi = re.match(r'/home/ankur/devbench/scientific/scisumm/demo/(.+)-parscit-section\.xml',
                        infile).group(1)
        key = infi + "-" + str(idx)
        test_data[key] = {'sentence': doc[idx].sentence.encode('utf-8'),
                          'textrank': ranker.scores[x - 1][1],
                          'contextpre': getContext(doc, idx, -2),
                          'contextpos': getContext(doc, idx, 2)}
    writeToFile(outfile, samples, 'w')
    #ranker = Ranker(sentences, tfidf=False)
    #return ranker, sent_idx
    #-----------------------------------------
    # Calculating the sectional TF-IDF
    sections = []
    for sec, block in doc.document.items():
        sentences = ''
        for key in sorted(block.keys()):
            sentences += (str(block[key]))
        sections.append(sentences)
    ranker = Ranker(sections)
    #-----------------------------------------
    # The sent_idx needs to be converted to reflect the corresponding section
    # index
    section_idx = sent2Section(doc, sent_idx)
    if backup:
        backupfile = DIR['BASE'] + "data/backup.txt"
        writeToFile(backupfile, "\n---------" + str(doc) + "---------\n", 'a')
        writeToFile(backupfile, samples, 'a')
    return ranker, section_idx, sent_idx
def __init__(self):
    super().__init__()
    self.document = Document()
    self.nodes = self.document.nodeList
    self.docs = [item.lower().split() for item in self.document.rawDocList]
    if Path('word2vecTrained.model').exists():
        self.word2Vec = Word2Vec.load('word2vecTrained.model')
    else:
        self.word2Vec = Word2Vec(self.docs, size=100, window=4, min_count=1, workers=3)
        self.word2Vec.save('word2vecTrained.model')
def load_data(src_file, tgt_file):
    docs = []
    with open(src_file, 'r', encoding='utf-8') as src_reader, \
         open(tgt_file, 'r', encoding='utf-8') as tgt_reader:
        for src_line, tgt_line in zip(src_reader, tgt_reader):
            src_line = src_line.strip()
            tgt_line = tgt_line.strip()
            if src_line == "" or tgt_line == "":
                docs.append(None)
                continue
            src_sents = src_line.split('##SENT##')
            tgt_sents = tgt_line.strip().split('##SENT##')
            docs.append(Document(src_sents, tgt_sents))
    return docs
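A short usage sketch, assuming parallel source/target files in which each line is one document whose sentences are joined by the ##SENT## marker, and blank lines mark documents to skip; the file names are illustrative.

# Hypothetical parallel files; blank lines become None entries, as in load_data() above.
docs = load_data('train.src', 'train.tgt')
parallel_docs = [d for d in docs if d is not None]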
def map(self, line):
    # TODO: Your code goes here -- call `self.emit(key, value)`
    doc = Document(line)
    shortest = float('inf')
    # current cluster label of this data point
    cur_center = 999
    for cluster in self.clusters:
        dist_temp = self.l2_norm(doc.tfidf, cluster.tfidf)
        if dist_temp < shortest:
            shortest = dist_temp
            cur_center = cluster.uid
    self.emit(str(cur_center), str(doc))
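The mappers above only assign each document to its nearest cluster; as a rough sketch of the other half of a MapReduce k-means step, a companion reducer might average the tf-idf vectors of the documents grouped under one cluster id. The reduce signature, the dict-based tfidf attribute, and the emit() call are assumptions mirroring the mappers, not code from the original projects.

# Hypothetical reducer: new centroid = mean of the assigned documents' tf-idf maps.
def reduce(self, key, values):
    centroid, count = {}, 0
    for value in values:
        doc = Document(value)
        count += 1
        for term, weight in doc.tfidf.items():
            centroid[term] = centroid.get(term, 0.0) + weight
    for term in centroid:
        centroid[term] /= count
    self.emit(key, str(centroid))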
def predict(test_data):
    with open('predictions.csv', 'w', encoding='utf-8') as output:
        writer = csv.writer(output)
        writer.writerow([
            'document', 'predict_class', 'predict_score', 'exp_predict_score'
        ])
        for instance in test_data.iterrows():
            doctext = instance[1]['document']
            doc = Document(doctext)
            predict_clas = max(stats.classes, key=lambda c: _compute_score(doc, c))
            predict_score = _compute_score(doc, predict_clas)
            exp_predict_score = np.exp(predict_score)
            writer.writerow(
                [doctext, predict_clas, predict_score, exp_predict_score])
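_compute_score and stats are not shown in the snippet above; a minimal sketch under the assumption that the score is a log-space Naive Bayes sum, with hypothetical stats.log_prior / stats.log_likelihood tables and a doc.tokens attribute.

import math

# Hypothetical scorer: class log-prior plus summed token log-likelihoods,
# with a small floor for unseen tokens. All names here are assumptions.
def _compute_score(doc, clas):
    score = stats.log_prior[clas]
    for token in doc.tokens:
        score += stats.log_likelihood[clas].get(token, math.log(1e-10))
    return score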
def process_twitter_folder(folder, metadata):
    textfiles = [
        join(folder, f) for f in listdir(folder)
        if isfile(join(folder, f)) and f.endswith(".txt")
    ]
    #textfiles = textfiles[:2]  # limit for quick processing if you wish, but should be ok to work with all.
    documents = []
    for tf in textfiles:
        textname = splitext(split(tf)[1])[0]  # extract just the username from the filename.
        print('Processing ' + textname)
        document = Document(textname, metadata)
        document.process_document_from_textfile(tf)
        documents.append(document)
    return documents
def __init__(self, manualFilepath=config.manualPath, k=60):
    documents = Document(manualFilepath)
    docs = documents.docList
    self.Doc = documents
    self.docTree = documents.tree
    self.nodes = documents.nodeList
    self.uniqueTerms = getAllUniqueTerms(docs)
    self.allTopics = documents.allTopics
    termDocMatrix = getTermDocMatrix(self.uniqueTerms, docs)
    u, s, vh = np.linalg.svd(termDocMatrix, full_matrices=False)
    S = np.diag(s)
    # Truncate the factors to rank k.
    self.uk = u[:, :k]
    self.Sk = S[:k, :k]
    self.vhk = vh[:k, :]
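The constructor above keeps the rank-k factors of a term-document SVD (classic LSA); a possible follow-up, not present in the snippet, is folding a query's term-count vector into the same k-dimensional space. The method name and the terms-by-documents orientation of termDocMatrix are assumptions.

import numpy as np

# Hypothetical query folding: q holds one count per term in self.uniqueTerms,
# in the same term order used to build termDocMatrix (terms x documents).
def fold_in_query(self, q):
    # q_hat = q^T * U_k * S_k^-1 places the query next to the document columns of vhk.
    return np.asarray(q) @ self.uk @ np.linalg.inv(self.Sk)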
def load_data(path=""):
    time.sleep(0.5)
    print("start to load data from path----->", path)
    time.sleep(0.5)
    file_list = os.listdir(path)
    sentences = list()
    for i in range(len(file_list)):
        filename = file_list[i]
        current_path = os.path.join(path, filename)
        document = Document(filename=current_path)
        for sentence in document.sentence_list:
            sentences.append(sentence)
    return sentences
def search(self):
    logger = logging.getLogger("qa_logger")
    logger.info("%s:\tDocument Retrieval", self.id_q)

    search_engines = self._get_search_engines()

    try:
        num = int(MyConfig.get("document_retrieval", "n_results"))
    except MyConfigException as e:
        logger = logging.getLogger("qa_logger")
        logger.warning(str(e))
        num = 10

    results = []
    for engine in search_engines:
        try:
            results += engine.search(self.query, count=num)
        except Exception as e:
            logger = logging.getLogger("qa_logger")
            logger.error("Problem with search engine.")
            logger.debug(e)
            sys.exit(1)

    doc_list = []
    # rank loops over [0..num-1]
    rank = 0
    # ignore repeated urls
    unique_urls = set()
    for resource in results:
        if resource.url in unique_urls:
            continue
        unique_urls.add(resource.url)
        # rank+1 loops over [1..num]
        # rank+1 is the relative position of the results
        doc_list.append(Document(resource, rank + 1))
        rank = (rank + 1) % num

    try:
        if MyConfig.get("persistence", "document") == "True":
            output = open("documentos.pkl", "wb")
            pickle.dump(doc_list, output, 0)
            output.close()
    except MyConfigException as e:
        logger = logging.getLogger("qa_logger")
        logger.warning(str(e))

    return doc_list