def __init__(self, cacm_words_path, common_words_filename):
    """Load the CACM collection file and prepare parsing state.

    cacm_words_path -- path to the CACM collection file; read eagerly
        into ``self.lines`` so parsing can index lines by number.
    common_words_filename -- stop-word list forwarded to Process_text.
    """
    with open(cacm_words_path, 'r') as f:
        self.lines = f.readlines()
    # Cursor into self.lines shared by all parsing methods.
    self.current_line_number = 0
    # Raw string avoids the invalid-escape-sequence SyntaxWarning for
    # '\.' and '\s' on modern Python; <id> captures the document number
    # from a '.I <id>' header line.
    self.document_begin_regex = re.compile(r'^\.I\s(?P<id>\d*)')
    # Every two-character section marker that can introduce a category.
    self.category_markers = ['.I', '.T', '.W', '.B', '.A', '.N', '.X', '.K', '.C']
    # Filled by parsing: [(document_id, {word: tf, ...}), ...]
    self.documents = []
    # NOTE(review): Process_text is declared elsewhere in the project.
    self.text_processor = Process_text(common_words_filename)
class Parse_cacm:
    """Parse a CACM collection file into (document_id, term-stats) pairs.

    The CACM format is line-oriented: a document starts with ``.I <id>``
    and is followed by category sections, each introduced by a two-char
    marker (``.T`` title, ``.W`` abstract, ``.K`` keywords, ...).  Only
    the .T/.W/.K sections contribute text to the index.
    """

    def __init__(self, cacm_words_path, common_words_filename):
        """Read the collection file and prepare parsing state.

        cacm_words_path -- path to the CACM collection file.
        common_words_filename -- stop-word list given to Process_text.
        """
        with open(cacm_words_path, 'r') as f:
            self.lines = f.readlines()
        # Cursor into self.lines shared by all parsing methods.
        self.current_line_number = 0
        # Raw string avoids the invalid-escape-sequence SyntaxWarning for
        # '\.' and '\s' on modern Python; <id> captures the document id.
        self.document_begin_regex = re.compile(r'^\.I\s(?P<id>\d*)')
        # Every two-character marker that can introduce a category.
        self.category_markers = [
            '.I', '.T', '.W', '.B', '.A', '.N', '.X', '.K', '.C'
        ]
        self.documents = []
        self.text_processor = Process_text(common_words_filename)

    def parse_file(self):
        """Parse every document in the file.

        Returns the index as [(document_id, {word: tf, ...}), ...].
        """
        # -1 because iteration is advanced at the end of the loop body,
        # and the last line is blank so it can be ignored.
        while self.current_line_number < len(self.lines) - 1:
            line = self.lines[self.current_line_number]
            match = self.document_begin_regex.match(line)
            if match:
                # Consumes lines up to the next '.I' declaration.
                self.process_document(int(match.group('id')))
            else:
                self.current_line_number += 1
        return self.documents

    def process_document(self, document_id):
        """Collect one document's indexable text and store its stats.

        Stops on the next document declaration ('.I <id>') or at the end
        of the file; appends (document_id, stats) to self.documents.
        """
        content = ''
        self.current_line_number += 1
        line = self.lines[self.current_line_number]
        while line[0:2] != '.I' and self.current_line_number < len(self.lines) - 1:
            if line[0:2] in ('.T', '.W', '.K'):
                # Title, abstract and keywords are the indexable sections;
                # process_content leaves the cursor on the next marker.
                content += ' ' + self.process_content()
            else:
                self.current_line_number += 1
            line = self.lines[self.current_line_number]
        self.documents.append(
            (document_id,
             self.text_processor.sanitize_rawtext_with_stats(content)))
        return None

    def process_content(self):
        """Aggregate the lines of one category section.

        Returns the section text (possibly multi-line); the cursor ends
        on the next category marker or the last line of the file.
        """
        self.current_line_number += 1
        line = self.lines[self.current_line_number]
        content = ''
        while (line[0:2] not in self.category_markers
               and self.current_line_number < len(self.lines) - 1):
            # Content may span multiple lines; words are split later by
            # the text processor, so raw newlines are kept here.
            content += line
            self.current_line_number += 1
            line = self.lines[self.current_line_number]
        return content
class Parse_cacm:
    """Reader for the CACM test collection.

    Documents are delimited by '.I <id>' header lines; each document is
    split into categories introduced by two-character markers.  Only the
    title (.T), abstract (.W) and keyword (.K) categories are indexed.
    """

    def __init__(self, cacm_words_path, common_words_filename):
        """Slurp the collection file and set up parser state."""
        with open(cacm_words_path, 'r') as source:
            self.lines = source.readlines()
        self.current_line_number = 0
        # Matches a document header and captures its numeric id.
        self.document_begin_regex = re.compile(r'^\.I\s(?P<id>\d*)')
        self.category_markers = [
            '.I', '.T', '.W', '.B', '.A', '.N', '.X', '.K', '.C'
        ]
        self.documents = []
        self.text_processor = Process_text(common_words_filename)

    def parse_file(self):
        """Walk the whole file; return [(doc_id, {word: tf, ...}), ...]."""
        # The final line is blank, so scanning stops one short of it.
        last = len(self.lines) - 1
        while self.current_line_number < last:
            header = self.lines[self.current_line_number]
            found = self.document_begin_regex.match(header)
            if not found:
                self.current_line_number += 1
                continue
            # Advances the cursor past this whole document.
            self.process_document(int(found.group('id')))
        return self.documents

    def process_document(self, document_id):
        """Gather one document's indexable text and record its stats."""
        pieces = []
        self.current_line_number += 1
        while True:
            row = self.lines[self.current_line_number]
            # A new '.I' header or the end of the file ends this document.
            if row[0:2] == '.I' or self.current_line_number >= len(self.lines) - 1:
                break
            if row[0:2] in ('.T', '.W', '.K'):
                # Indexable category: collect it (cursor lands on the
                # marker that follows the category's text).
                pieces.append(' ' + self.process_content())
            else:
                self.current_line_number += 1
        stats = self.text_processor.sanitize_rawtext_with_stats(''.join(pieces))
        self.documents.append((document_id, stats))
        return None

    def process_content(self):
        """Concatenate the lines of one category, up to the next marker."""
        self.current_line_number += 1
        gathered = ''
        while True:
            row = self.lines[self.current_line_number]
            if row[0:2] in self.category_markers:
                break
            if self.current_line_number >= len(self.lines) - 1:
                break
            # Newlines are kept; later processing tokenizes word by word.
            gathered += row
            self.current_line_number += 1
        return gathered