def ilg_text_to_lines(path, delimiter=None):
    """
    Converts an ilg file to text lines.

    Parameters
    ----------
    path : string
        Path to ilg file
    delimiter : string, optional
        Field delimiter within a line; defaults to None, which splits
        on any run of whitespace (generalized from the previously
        hard-coded local so the delimiter check below is reachable)

    Returns
    -------
    list
        List of (line_index, fields) tuples, one per non-blank line,
        where fields is the stripped line split on the delimiter

    Raises
    ------
    DelimiterError
        If a delimiter is given but never occurs in the file text
    """
    with open(path, encoding='utf-8-sig', mode='r') as f:
        text = f.read()
    if delimiter is not None and delimiter not in text:
        raise DelimiterError(
            'The delimiter specified does not create multiple words. '
            'Please specify another delimiter.')
    # Keep the original line index so callers can report positions,
    # skipping lines that are empty after stripping.
    lines = [(i, line.strip().split(delimiter))
             for i, line in enumerate(text.splitlines())
             if line.strip() != '']
    return lines
def text_to_lines(path, delimiter=None):
    """
    Parse a text file into lines.

    Parameters
    ----------
    path : str
        Fully specified path to text file
    delimiter : str, optional
        Field delimiter within a line; defaults to None, which splits
        on any run of whitespace (generalized from the previously
        hard-coded local so the delimiter check below is reachable)

    Returns
    -------
    list
        Non-empty lines in the text file, each stripped and split on
        the delimiter

    Raises
    ------
    DelimiterError
        If a delimiter is given but never occurs in the file text
    """
    with open(path, encoding='utf-8-sig', mode='r') as f:
        text = f.read()
    if delimiter is not None and delimiter not in text:
        raise DelimiterError(
            'The delimiter specified does not create multiple words. '
            'Please specify another delimiter.')
    lines = [
        line.strip().split(delimiter)
        for line in text.splitlines()
        if line.strip() != ''
    ]
    return lines
def parse_discourse(self, path):
    '''
    Parse a column-delimited (CSV-style) file for later importing.

    Parameters
    ----------
    path : str
        Path to CSV file

    Returns
    -------
    :class:`~polyglotdb.io.discoursedata.DiscourseData`
        Parsed data from the file

    Raises
    ------
    CorpusIntegrityError
        If a column named 'transcription' is not configured as a
        :class:`TranscriptionAnnotationType`
    DelimiterError
        If splitting the header on ``self.column_delimiter`` yields a
        single column (the delimiter does not match the file)
    '''
    # Discourse name is the file name without directory or extension.
    name = os.path.splitext(os.path.split(path)[1])[0]
    for a in self.annotation_types:
        if a.name == 'transcription' and not isinstance(
                a, TranscriptionAnnotationType):
            raise CorpusIntegrityError(
                ('The column \'{}\' is currently '
                 'not being parsed as transcriptions '
                 'despite its name. Please ensure correct '
                 'parsing for this column by changing its '
                 '\'Annotation type\' in the parsing '
                 'preview to the right.').format(a.name))
    for a in self.annotation_types:
        a.reset()
    with open(path, encoding='utf-8') as f:
        headers = f.readline()
        headers = headers.split(self.column_delimiter)
        if len(headers) == 1:
            # BUG FIX: the original message contained '\n\Check' — the
            # invalid escape '\C' rendered a stray backslash in the
            # user-facing text; intended wording restored.
            raise DelimiterError('Could not parse the corpus.\nCheck '
                                 'that the delimiter you typed in matches '
                                 'the one used in the file.')
        # Iterate the file lazily instead of materializing readlines().
        for line in f:
            line = line.strip()
            if not line:  # blank or just a newline
                continue
            # Pair each column value with its annotation type by
            # position; zip truncates to the shorter of the two.
            for i, (k, v) in enumerate(
                    zip(headers, line.split(self.column_delimiter))):
                self.annotation_types[i].add([(v.strip(), )])
    pg_annotations = self._parse_annotations()
    data = DiscourseData(name, pg_annotations, self.hierarchy)
    # Reset annotation types so the parser can be reused for the next file.
    for a in self.annotation_types:
        a.reset()
    return data