def csv_to_iepy(filepath):
    """Import the documents listed in a CSV file through DocumentManager.

    The file is opened in text mode, via ``gzip`` when its name ends in
    ``.gz``. Its header must contain at least the ``document_id`` and
    ``document_text`` columns. Rows whose ``document_id`` was already seen
    in this file are skipped.

    Args:
        filepath: path of the CSV (optionally gzipped) file to import.

    Raises:
        SystemExit: when the header lacks any of the expected columns,
            including the case of a completely empty file.
    """
    print('Importing Documents to IEPY from {}'.format(filepath))
    from iepy.data.db import DocumentManager

    opener = gzip.open if filepath.endswith(".gz") else open
    # The context manager closes the handle on every exit path, including
    # the sys.exit() below.
    with opener(filepath, "rt") as fin:
        reader = csv.DictReader(fin)
        expected_fnames = ['document_id', 'document_text']
        # DictReader.fieldnames is None for an empty file; treat that as
        # "no columns" so the user gets the message below, not a TypeError.
        if not set(reader.fieldnames or ()).issuperset(expected_fnames):
            msg = "Couldn't find the expected field names on the provided csv {}"
            sys.exit(msg.format(expected_fnames))

        name = os.path.basename(filepath)
        docdb = DocumentManager()
        seen = set()
        # Pre-set so the summary below works when the file has no data rows.
        rows_read = 0
        for rows_read, d in enumerate(reader, 1):
            doc_id = d["document_id"]
            if doc_id in seen:
                continue
            seen.add(doc_id)
            docdb.create_document(
                identifier=doc_id,
                text=d["document_text"],
                metadata={"input_filename": name},
                update_mode=True
            )
    # As before, the reported number is the count of rows read, which
    # includes rows skipped as duplicates.
    print('Added {} documents'.format(rows_read))
def csv_to_iepy(filepath):
    """Import the documents listed in a CSV file through DocumentManager.

    The file is opened in text mode, via ``gzip`` when its name ends in
    ``.gz``. Its header must contain at least the ``document_id`` and
    ``document_text`` columns. Rows that the csv module fails to parse are
    logged and skipped; rows whose ``document_id`` was already seen in this
    file are skipped silently.

    Args:
        filepath: path of the CSV (optionally gzipped) file to import.

    Raises:
        SystemExit: when the header lacks any of the expected columns,
            including the case of a completely empty file.
    """
    print('Importing Documents to IEPY from {}'.format(filepath))
    from iepy.data.db import DocumentManager

    if filepath.endswith(".gz"):
        fin = gzip.open(filepath, "rt")
    else:
        fin = open(filepath, "rt")

    # Close the handle on every exit path, including the sys.exit() below.
    with fin:
        reader = csv.DictReader(fin)
        expected_fnames = ['document_id', 'document_text']
        # DictReader.fieldnames is None for an empty file; fall back to an
        # empty tuple so the friendly message is shown instead of a TypeError.
        if not set(reader.fieldnames or ()).issuperset(expected_fnames):
            msg = "Couldn't find the expected field names on the provided csv {}"
            sys.exit(msg.format(expected_fnames))

        name = os.path.basename(filepath)
        docdb = DocumentManager()
        seen = set()
        i = 0
        # Rows are pulled one at a time (instead of a for-loop) so that a
        # csv.Error on one row can be logged without ending the iteration.
        while True:
            try:
                d = next(reader)
            except StopIteration:
                break
            except csv.Error as error:
                # Logger.warn is a deprecated alias of Logger.warning.
                logger.warning("Couldn't load document: {}".format(error))
                continue
            # Counts every successfully parsed row, duplicates included.
            i += 1
            doc_id = d["document_id"]
            if doc_id in seen:
                continue
            seen.add(doc_id)
            docdb.create_document(
                identifier=doc_id,
                text=d["document_text"],
                metadata={"input_filename": name},
                update_mode=True
            )
    print('Added {} documents'.format(i))
def csv_to_iepy(filepath):
    """Feed every document of a CSV file to DocumentManager.create_document.

    A path ending in ``.gz`` is read through ``gzip``; anything else is
    opened as plain text. The header has to provide the ``document_id`` and
    ``document_text`` columns. A row that makes the csv module raise is
    logged as a warning and skipped, and a ``document_id`` already seen in
    this file is not imported again.

    Args:
        filepath: location of the CSV (optionally gzipped) file.

    Raises:
        SystemExit: if the expected columns are missing from the header
            (an empty file counts as having no columns).
    """
    print('Importing Documents to IEPY from {}'.format(filepath))
    from iepy.data.db import DocumentManager

    open_text = gzip.open if filepath.endswith(".gz") else open
    # "with" guarantees the file is closed, also when sys.exit() fires.
    with open_text(filepath, "rt") as fin:
        reader = csv.DictReader(fin)
        expected_fnames = ['document_id', 'document_text']
        # fieldnames is None when the file is empty; "or ()" keeps set()
        # from raising TypeError so the intended message is reported.
        header = set(reader.fieldnames or ())
        if not header.issuperset(expected_fnames):
            msg = "Couldn't find the expected field names on the provided csv {}"
            sys.exit(msg.format(expected_fnames))

        name = os.path.basename(filepath)
        docdb = DocumentManager()
        seen = set()
        i = 0
        # Manual next() calls let a single unparsable row be skipped while
        # the remaining rows are still processed.
        while True:
            try:
                d = next(reader)
            except StopIteration:
                break
            except csv.Error as error:
                # warning() replaces the deprecated warn() alias.
                logger.warning("Couldn't load document: {}".format(error))
                continue
            # Parsed rows are counted before the duplicate check, so the
            # final figure includes skipped duplicates.
            i += 1
            doc_id = d["document_id"]
            if doc_id in seen:
                continue
            seen.add(doc_id)
            docdb.create_document(identifier=doc_id,
                                  text=d["document_text"],
                                  metadata={"input_filename": name},
                                  update_mode=True)
    print('Added {} documents'.format(i))
def csv_to_iepy(filepath):
    """Import the rows of a CSV file as documents through DocumentManager.

    The file is opened in text mode, via ``gzip`` when its name ends in
    ``.gz``. Each row's ``freebase_mid`` column is used as the document
    identifier and its ``description`` column as the document text. Rows
    repeating a ``freebase_mid`` already seen in this file are skipped.

    Args:
        filepath: path of the CSV (optionally gzipped) file to import.

    Raises:
        KeyError: if a row lacks the ``freebase_mid`` or ``description``
            column (the header is not validated up front).
    """
    logger.info('Importing Documents to IEPY from {}'.format(filepath))
    from iepy.data.db import DocumentManager

    opener = gzip.open if filepath.endswith(".gz") else open
    # The context manager closes the handle even if a row fails to import.
    with opener(filepath, "rt") as fin:
        reader = csv.DictReader(fin)
        name = os.path.basename(filepath)
        docdb = DocumentManager()
        seen = set()
        # Pre-set so the summary below works when the file has no data rows.
        rows_read = 0
        for rows_read, d in enumerate(reader, 1):
            mid = d["freebase_mid"]
            if mid in seen:
                continue
            seen.add(mid)
            docdb.create_document(identifier=mid,
                                  text=d["description"],
                                  metadata={"input_filename": name})
    # As before, the reported number is the count of rows read, which
    # includes rows skipped as duplicates.
    logger.info('Added {} documents'.format(rows_read))
from docopt import docopt
from iepy.data.db import DocumentManager


if __name__ == "__main__":
    # Script entry point: import every row of the CSV file named on the
    # command line as a document, skipping repeated identifiers.
    logging.basicConfig(
        level=logging.INFO,
        format=u"%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    # The command-line interface is described by the module docstring.
    opts = docopt(__doc__, version=0.1)
    name = opts["<filename>"]
    # Files ending in .gz are read through gzip; both are opened as text.
    opener = gzip.open if name.endswith(".gz") else open
    # The context manager closes the input file once the import finishes
    # or fails.
    with opener(name, "rt") as fin:
        reader = csv.DictReader(fin)
        # From here on only the base name is needed, for the metadata.
        name = os.path.basename(name)
        docdb = DocumentManager()
        seen = set()
        for d in reader:
            mid = d["freebase_mid"]
            # Only the first row for each freebase_mid is imported.
            if mid in seen:
                continue
            seen.add(mid)
            docdb.create_document(
                identifier=mid,
                text=d["description"],
                metadata={"input_filename": name})
# NOTE(review): the next two statements are the tail of a function whose
# header lies outside this view; their nesting was lost when the file was
# collapsed onto one line and is reconstructed here -- confirm.
        per_season.append(season_ep)
    return per_season


if __name__ == '__main__':
    # Script entry point: store one document per episode page of a wikia
    # dump, season by season.
    logging.basicConfig()
    logger = logging.getLogger('wikia_to_iepy')
    logger.setLevel(logging.DEBUG)
    # The command-line interface is described by the module docstring.
    opts = docopt(__doc__, version=0.1)
    docs = DocumentManager()
    # build_pages_dict and get_episode are defined outside this view;
    # presumably get_episode yields one collection of episode pages per
    # season -- verify against their definitions.
    pages_dict = build_pages_dict(opts['<wikia_zipped_xml_dump_file>'])
    eps = get_episode(pages_dict, int(opts['<nr_of_seasons>']),
                      opts['--all-episodes-tag'],
                      opts['--season-tag-pattern'])
    # Season numbers passed to the metadata and the log start at 1.
    for season_nr, season in enumerate(eps, 1):
        # Number of episodes of this season that could not be stored.
        issues_counter = 0
        for i, e in enumerate(season):
            try:
                # The document text is left empty; the page's raw text is
                # kept in the metadata together with season and source file.
                docs.create_document(
                    identifier=e['title'],
                    text='',
                    metadata={
                        'raw_text': e['revision']['text']['#text'],
                        'season': season_nr,
                        'source': opts['<wikia_zipped_xml_dump_file>']
                    })
            except Exception as err:
                # Any failure is counted and logged, then the loop moves on
                # to the next episode.
                issues_counter += 1
                logger.error('Document not created, %s', err)
                continue
        # NOTE(review): placed after the inner loop because it reports a
        # per-season total; the original nesting is not visible -- confirm.
        logger.info('Dumped %i episodes from season %i',
                    len(season) - issues_counter, season_nr)
from docopt import docopt
from iepy.data.db import DocumentManager


if __name__ == "__main__":
    # Script entry point: read the CSV file given on the command line and
    # create one document per distinct "freebase_mid" value.
    logging.basicConfig(level=logging.INFO,
                        format=u"%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    # docopt builds the option parser from the module docstring.
    opts = docopt(__doc__, version=0.1)
    name = opts["<filename>"]
    if name.endswith(".gz"):
        fin = gzip.open(name, "rt")
    else:
        fin = open(name, "rt")
    # Entering the handle as a context manager ensures it is closed when
    # the import ends, whether normally or through an exception.
    with fin:
        reader = csv.DictReader(fin)
        # Only the base name of the input file is recorded in the metadata.
        name = os.path.basename(name)
        docdb = DocumentManager()
        seen = set()
        for d in reader:
            mid = d["freebase_mid"]
            # Later rows repeating an identifier are ignored.
            if mid in seen:
                continue
            seen.add(mid)
            docdb.create_document(identifier=mid,
                                  text=d["description"],
                                  metadata={"input_filename": name})