def excel_to_corpus(excel_path, corpus_path):
    '''NB! Make sure to use .xls file extension for Excel files.'''
    corpus = PyCorpus(corpus_path)
    excel = ExcelFile(excel_path)
    # as we do not know the number of sheets, we parse all of them
    # until we obtain an error
    idx = 0
    while True:
        try:
            df = excel.parse(str(idx))
            # recreate some information that was modified when exporting to xls
            new_df = dict()
            for col in df.columns:
                data = []
                for v in df[col]:
                    if type(v) == float and math.isnan(v):
                        data.append(None)
                    elif v == 0:
                        data.append(False)
                    elif v == 1:
                        data.append(True)
                    else:
                        data.append(v)
                new_df[col] = Series(data)
            corpus[str(idx)] = DataFrame(new_df)
        except xlrd.biffh.XLRDError:
            break
        idx += 1
    corpus.close()
def corpus_to_excel(corpus_path, excel_path):
    '''NB! Make sure to use .xls file extension for Excel files.'''
    corpus = PyCorpus(corpus_path)
    writer = ExcelWriter(excel_path)
    for key in corpus:
        corpus[key].to_excel(writer, sheet_name=key)
    writer.save()
    corpus.close()
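# Usage sketch for the two converters above (illustrative only; the file
# names are hypothetical and PyCorpus is assumed to exist on disk):
#
#     corpus_to_excel('articles.corpus', 'articles.xls')
#     excel_to_corpus('articles.xls', 'articles_roundtrip.corpus')
#
# Note that excel_to_corpus maps 0/1 cells back to False/True and NaN cells
# back to None, so values that were genuinely the integers 0 or 1 before
# export will come back as booleans after a round trip.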
def boi_to_t3corpus(orig_path, t3_path):
    '''Parse the file at orig_path, where documents are separated
    with -- lines, into a t3 corpus at t3_path.'''
    f = codecs.open(orig_path, 'rb', 'utf-8')
    contents = f.read()
    f.close()
    docs = re.split('--\r?\n\r?\n', contents)
    corpus = PyCorpus(t3_path)
    for i, doc in enumerate(docs):
        corpus[str(i + 1)] = parse_t3_doc_from_string(doc)
    corpus.close()
def startup(self, startFresh=False):
    """
    Starts up my persistence engine.

    This method replaces the stock C{adj} adjacency list attribute with an
    instance of L{PersistentDict}; for directed graphs, it replaces the
    C{succ} and C{pred} adjacency list attributes with two such instances.

    Important note regarding directed graph persistence
    ===================================================

    In directed graphs, we actually overwrite the C{adj} attribute instead
    of C{succ}. That's because C{adj} is used in the C{NX.Graph} superclass
    and C{succ} is merely set equal to it in the constructor of
    C{NX.DiGraph}.

    When C{x = y} in Python, changing some property of C{y} causes the same
    change in C{x}. However, we are changing the attribute C{adj} to
    reference an I{entirely new} object, not just changing its properties.
    Thus we have to refresh the C{self.succ = self.adj} link after
    overwriting C{adj}, giving C{succ} a reference to the I{new} C{adj}
    object. Adding or changing I{items} of either one will show up in the
    other, so no further hacking is required.

    @param startFresh: Set this keyword C{True} to clear any persisted
      content and start fresh. This keyword can also be set in the
      constructor. Obviously, you should use this option with care as it
      will B{erase} database entries!

    @return: A deferred that fires when the persistence engine is ready
      for use.
    """
    self._uniqueCount = 0

    def ID():
        self._uniqueCount += 1
        thisID = "%s-%d" % (self.name, self._uniqueCount)
        return hash(thisID)

    def started(null):
        self.succ = self.adj
        if startFresh or self.startFresh:
            return self.adjacencyListOperation("clear")

    dList = []
    url, kw = self.engineParams
    kw['nameType'] = self.nodeType
    # Adjacency lists
    for dictName in self.adjacencyLists:
        dictObject = PersistentDict(ID(), url, **kw)
        setattr(self, dictName, dictObject)
        dList.append(dictObject.preload())
    return defer.DeferredList(dList).addCallback(started)
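# A minimal, self-contained illustration of the aliasing behaviour the
# docstring above relies on: mutating a shared object is visible through
# both names, but rebinding one name to a new object silently breaks the
# link, which is why startup() refreshes self.succ after replacing self.adj.
def _aliasing_demo():
    adj = {}
    succ = adj                 # two names, one dict
    adj['a'] = 1
    assert succ == {'a': 1}    # mutation shows up through both names
    adj = {}                   # rebinding: adj now points at a *new* dict
    assert succ == {'a': 1}    # succ still references the old object
    succ = adj                 # the link must be refreshed explicitly
    assert succ is adj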
def crf_model_predict(model_path, corpus, target_path, series_name):
    f = open(model_path, 'rb')
    model, kwargs = cPickle.load(f)
    f.close()
    s = Corpus(target_path)
    for doc_id, predictions in crf_predict(model, corpus, **kwargs):
        doc = corpus[doc_id]
        doc[series_name] = predictions
        s[doc_id] = doc
        sys.stderr.write('Document {0} classified.\n'.format(doc_id))
    s.close()
def parse_wikipedia(path, corpus_path):
    '''path - the directory containing the documents extracted by
    WikiExtractor.py.
    corpus_path - the filename to store the parsed corpus.
    '''
    corpus = PyCorpus(corpus_path)

    def from_path(path):
        sys.stderr.write('Processing path ' + path + '\n')
        files = os.listdir(path)
        for f in files:
            newpath = os.path.join(path, f)
            if os.path.isdir(newpath):
                from_path(newpath)
            else:
                sys.stderr.write('Processing file ' + newpath + '\n')
                get_documents(newpath)

    def get_documents(path):
        f = codecs.open(path, 'r', 'utf-8')
        contents = f.read()
        f.close()
        doctexts = contents.split('<doc id="')
        for text in doctexts:
            text = text.strip()
            if len(text) < 1:
                continue
            # extract the document parts
            doc_id = int(text[:text.index('"')])
            # the title is extracted but currently not stored in the corpus
            start = text.index('title="') + 7
            title = text[start:text.index('"', start)]
            text = text[text.index('\n'):text.index('</doc>')].strip()
            text_stream = cStringIO.StringIO(text.encode('utf-8'))
            utf8_stream = codecs.getreader('utf-8')(text_stream)
            corpus[str(doc_id)] = parse_plain_doc_from_stream(utf8_stream)

    from_path(path)
    corpus.sync()
    return corpus
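# Self-contained sketch of the WikiExtractor document layout that
# parse_wikipedia() expects: each document is wrapped in
# <doc id="..." url="..." title="..."> ... </doc>, and the id/title
# extraction above is plain string slicing on the chunk that follows the
# '<doc id="' split point. The sample text below is made up.
def _wikiextractor_format_demo():
    text = u'42" url="http://example.org" title="Sample page">\nBody text.\n</doc>'
    doc_id = int(text[:text.index('"')])
    start = text.index('title="') + 7
    title = text[start:text.index('"', start)]
    body = text[text.index('\n'):text.index('</doc>')].strip()
    assert (doc_id, title, body) == (42, u'Sample page', u'Body text.')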
def parse_plain_corpus(plainpath, corpuspath):
    corpus = PyCorpus(corpuspath)
    data = codecs.open(plainpath, 'rb', 'utf-8').read()
    # split documents on blank lines
    docs = re.split(r'\s*?\r?\n\r?\n', data)
    data = None
    corpus.autocommit(False)
    for doc in docs:
        lines = re.split(r'\r?\n', doc.strip())
        title = lines[0].strip()
        contents = '\n'.join(lines[1:]).strip()
        text_stream = cStringIO.StringIO(contents.encode('utf-8'))
        utf8_stream = codecs.getreader('utf-8')(text_stream)
        corpus[title] = parse_plain_doc_from_stream(utf8_stream)
    corpus.commit()
    corpus.close()
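# Quick, runnable check of the blank-line document splitting used above
# (standard library only; the sample string is made up). Each document is a
# title line followed by body lines, with documents separated by blank lines.
def _blank_line_split_demo():
    data = u'Title A\r\nline 1\r\n\r\nTitle B\nline 1\n\n'
    docs = re.split(r'\s*?\r?\n\r?\n', data)
    docs = [d for d in docs if d.strip()]
    assert [d.splitlines()[0] for d in docs] == [u'Title A', u'Title B']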
def __getitem__(self, key):
    if key in self.subkeys:
        return PyCorpus.__getitem__(self, key)
    raise KeyError(key)
def __init__(self, *args, **kwargs):
    self.subkeys = set()
    if 'keys' in kwargs:
        self.subkeys = set(kwargs['keys'])
        del kwargs['keys']
    PyCorpus.__init__(self, *args, **kwargs)
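# Usage sketch for the key-restricted corpus defined by the two methods
# above (the class name and file name here are hypothetical; PyCorpus is
# assumed to accept a path as its first argument):
#
#     sub = SubCorpus('full.corpus', keys=['1', '7', '12'])
#     doc = sub['7']   # allowed: '7' is in the requested subset
#     doc = sub['2']   # raises KeyError even if '2' exists on disk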
def as_treetagger_corpus(orig_path, dest_path, encoding='latin-1', language='english'):
    assert orig_path != dest_path
    orig = PyCorpus(orig_path)
    dest = PyCorpus(dest_path)
    dest.autocommit(False)
    for doc_id in orig.keys():
        dest[doc_id] = as_treetagger_doc(orig[doc_id], encoding=encoding,
                                         language=language)
    dest.commit()
    orig.close()
    dest.close()
def as_eng_postagged_corpus(orig_path, eng_path):
    '''Uses nltk default tagger.'''
    assert orig_path != eng_path
    orig = PyCorpus(orig_path)
    dest = PyCorpus(eng_path)
    dest.autocommit(False)
    for doc_id in orig.keys():
        dest[doc_id] = as_eng_postagged_doc(orig[doc_id])
    dest.commit()
    orig.close()
    dest.close()
def as_t3corpus(orig_path, t3_path):
    '''Convert the corpus at orig_path to a t3mesta corpus at t3_path.'''
    orig_corpus = PyCorpus(orig_path)
    dest_corpus = PyCorpus(t3_path)
    dest_corpus.autocommit(False)
    dest_keys = set(dest_corpus.keys())
    for key in orig_corpus.keys():
        if key not in dest_keys:
            dest_corpus[key] = as_t3doc(orig_corpus[key])
    dest_corpus.commit()
    orig_corpus.close()
    dest_corpus.close()
def crf_model_predict_mc(model_path, corpus, target_path, series_name, n):
    '''Multi-core version of crf_model_predict.
    n - number of processes to use.
    '''
    sys.stderr.write('Dividing documents between {0} processes.\n'.format(n))
    # distribute document ids round-robin over n lists
    id_lists = [[] for _ in range(n)]
    idx = 0
    for doc_id in corpus.keys():
        id_lists[idx].append(doc_id)
        idx += 1
        if idx >= n:
            idx = 0
    sys.stderr.write('Launching processes.\n')
    dest_names = []
    processes = []
    for idx, ids in enumerate(id_lists):
        if len(ids) > 0:
            folder = tempfile.mkdtemp()
            # write the temporary source corpus for this process
            src_name = os.path.join(folder, 'src.corpus')
            dest_name = os.path.join(folder, 'dest.corpus')
            tmp_corp = Corpus(src_name)
            tmp_corp.autocommit(False)
            for doc_id in ids:
                tmp_corp[doc_id] = corpus[doc_id]
            tmp_corp.close()
            # start the process
            process = Process(target=crf_process,
                              args=(model_path, src_name, dest_name,
                                    series_name))
            process.start()
            sys.stderr.write('Process {0} launched.\n'.format(idx))
            # store the identifiers
            dest_names.append(dest_name)
            processes.append(process)
    for p in processes:
        p.join()
    sys.stderr.write('Processes finished!\n')
    # concatenate temporary outputs
    target_corp = Corpus(target_path)
    target_corp.autocommit(False)
    for dest_name in dest_names:
        tmp_corp = Corpus(dest_name)
        for doc_id in tmp_corp:
            target_corp[doc_id] = tmp_corp[doc_id]
        tmp_corp.close()
    target_corp.commit()
    target_corp.close()
    sys.stderr.write('Corpus {0} created.\n'.format(target_path))
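# Usage sketch (hypothetical paths; assumes a pickled (model, kwargs) pair
# produced elsewhere and an existing Corpus of documents to classify):
#
#     corpus = Corpus('tagged.corpus')
#     crf_model_predict_mc('crf.model', corpus, 'predicted.corpus',
#                          'label', n=4)
#     corpus.close()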
def crf_process(model_path, tmp_corpus_path, tmp_target_path, series_name):
    tmp_corp = Corpus(tmp_corpus_path)
    crf_model_predict(model_path, tmp_corp, tmp_target_path, series_name)
    tmp_corp.close()