Example #1
def main():
    args = parse_args()

    input_file = utils.open_compressed_file(args.input_file)
    conn = sqlite3.connect(str(args.sqlite_file))

    create_tables(conn)
    conn.execute('PRAGMA synchronous = OFF')
    conn.execute('PRAGMA journal_mode = MEMORY')

    print('Inserting data...')
    with input_file, conn:
        reader = csv.reader(input_file)
        assert next(reader) == ['timestamp', 'from', 'to']
        records = (parse_record(r) for r in reader)

        db_records = (
            (r.timestamp, args.project, r.from_, r.to)
            for r in records
        )

        conn.executemany(
            'INSERT INTO moves VALUES (?, ?, ?, ?)',
            db_records,
        )

    if args.create_indexes:
        print('Creating indexes...')
        create_indexes(conn)  # pass the connection, mirroring create_tables(conn) above
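Example #1 calls several helpers it does not show. A minimal sketch of what `parse_record` and `create_tables` could look like, assuming a single `moves` table with the four columns the INSERT expects and a simple namedtuple record; all names here are hypothetical:

import collections

# Hypothetical record shape for one CSV row: (timestamp, from, to).
MoveRecord = collections.namedtuple('MoveRecord', ['timestamp', 'from_', 'to'])

def parse_record(row):
    timestamp, from_, to = row
    return MoveRecord(timestamp, from_, to)

def create_tables(conn):
    # Four columns, matching 'INSERT INTO moves VALUES (?, ?, ?, ?)' above.
    conn.execute('''
        CREATE TABLE IF NOT EXISTS moves (
            timestamp TEXT,
            project TEXT,
            from_title TEXT,
            to_title TEXT
        )
    ''')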
Example #2
def main():
    args = parse_args()

    db_conn = pymysql.connect(**args.mysql_url)

    insert_tpl = '''
    INSERT INTO `mag_papers` (
        `paper_id`,
        `original_paper_title`,
        `normalized_paper_title`,
        `paper_publish_year`,
        `paper_publish_date`,
        `paper_doi`,
        `original_venue_name`,
        `normalized_venue_name`,
        `journal_id_mapped_to_venue_name`,
        `conference_series_id_mapped_to_venue_name`,
        `paper_rank`
    ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    '''

    if args.create_tables:
        print('Creating tables and indexes')
        create_tables_and_indexes(db_conn.cursor())
        db_conn.commit()

    print('Reading', args.input_csv, '...')
    input_file = utils.open_compressed_file(args.input_csv)
    cursor = db_conn.cursor()
    with input_file, cursor:
        csvreader = csv.reader(
            input_file,
            delimiter='\t',
            quoting=csv.QUOTE_NONE,
        )
        records = (parse_papers_record(r) for r in csvreader)

        records_truncated = ((
            r.paper_id[:50],
            r.original_paper_title[:255],
            r.normalized_paper_title[:255],
            r.paper_publish_year,
            r.paper_publish_date,
            r.paper_doi[:255],
            r.original_venue_name[:255],
            r.normalized_venue_name[:255],
            r.journal_id_mapped_to_venue_name[:255],
            r.converence_series_id_mapped_to_venue_name[:255],
            r.paper_rank,
        ) for r in records)

        records_with_progress = frogress.bar(
            records_truncated,
            steps=args.expected_records,
        )
        cursor.executemany(insert_tpl, records_with_progress)
    db_conn.commit()
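Nearly every example opens its input through `utils.open_compressed_file` (or a bare `open_compressed_file`), which is not shown anywhere on this page. A plausible sketch, assuming the helper picks a decompressor by file extension and returns a text-mode file object; the real implementation in `utils` may differ:

import bz2
import gzip
import lzma

def open_compressed_file(path, mode='rt', encoding='utf-8'):
    # Transparently open plain, gzip, bzip2 or xz files based on the extension.
    path = str(path)
    if path.endswith('.gz'):
        return gzip.open(path, mode, encoding=encoding)
    if path.endswith('.bz2'):
        return bz2.open(path, mode, encoding=encoding)
    if path.endswith('.xz'):
        return lzma.open(path, mode, encoding=encoding)
    return open(path, mode, encoding=encoding)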
Example #3
def main():
    logging.config.fileConfig('logging.ini', disable_existing_loggers=False)
    args = parse_args()
    parser_cls = bacparser.parsers.get_parser_cls(args.year)
    if args.format == 'python':
        write = functools.partial(write_python, args.output)
    else:  # 'pickle'
        write = functools.partial(write_pickle, args.output)
    with args.output:
        for filename in args.filenames:
            with open_compressed_file(filename) as f:
                logging.info("Extracting from %s" % (filename, ))
                for i in parser_cls(f):
                    write(i)
Example #4
def main():
    logging.config.fileConfig("logging.ini", disable_existing_loggers=False)
    args = parse_args()
    parser_cls = bacparser.parsers.get_parser_cls(args.year)
    if args.format == "python":
        write = functools.partial(write_python, args.output)
    else:  # 'pickle'
        write = functools.partial(write_pickle, args.output)
    with args.output:
        for filename in args.filenames:
            with open_compressed_file(filename) as f:
                logging.info("Extracting from %s" % (filename,))
                for i in parser_cls(f):
                    write(i)
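Examples #3 and #4 write records through `write_python` or `write_pickle`, chosen by `args.format`, but neither helper is shown. A minimal sketch, assuming the 'python' format is one repr() per line and the 'pickle' format is a stream of consecutive pickles (which is what Example #6 reads back with pickle.Unpickler until EOFError); names and behaviour are assumptions:

import pickle

def write_python(output, obj):
    # args.output is assumed to be a text-mode file object here.
    output.write(repr(obj))
    output.write('\n')

def write_pickle(output, obj):
    # args.output is assumed to be a binary-mode file object here;
    # consecutive pickles can be read back in a loop until EOFError.
    pickle.dump(obj, output)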
Example #5
def main():
    logging.config.fileConfig('logging.ini', disable_existing_loggers=False)
    args = parse_args()
    parser = bacparser.parsers.get_parser(args.year)
    if args.format == 'python':
        write = functools.partial(write_python, args.output)
    else: # 'pickle'
        write = functools.partial(write_pickle, args.output)
    with args.output:
        for filename in args.filenames:
            with open_compressed_file(filename) as f:
                logging.info("Extracting from %s" % (filename,))
                main_table = get_main_table_from_file(f, args.year)
                for i in parser.get_elev(main_table):
                    write(i)
Example #6
def main():
    logging.config.fileConfig('logging.ini', disable_existing_loggers=False)
    args = parse_args()
    with args.output:
        csv_writer = UnicodeWriter(args.output)
        csv_writer.writerow(bacparser.models.get_model(args.year)._fields)
        for filename in args.filenames:
            logging.info('Converting %s' % (filename,))
            with open_compressed_file(filename) as f:
                unpickler = pickle.Unpickler(f)
                try:
                    while True:
                        o = unpickler.load()
                        csv_writer.writerow(o)
                except EOFError:
                    pass
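Example #6 writes rows through a `UnicodeWriter`, a name that usually comes from the old Python 2 csv recipe; under Python 3 the plain csv.writer already handles Unicode. A thin stand-in, purely as an assumption about the interface the example expects:

import csv

class UnicodeWriter:
    # Minimal wrapper kept only for API compatibility with the old recipe.
    def __init__(self, f, dialect=csv.excel, **kwds):
        self.writer = csv.writer(f, dialect=dialect, **kwds)

    def writerow(self, row):
        self.writer.writerow([str(field) for field in row])

    def writerows(self, rows):
        for row in rows:
            self.writerow(row)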
Example #7
def main():
    args = parse_args()

    db_conn = pymysql.connect(**args.mysql_url)

    insert_tpl = """
    INSERT INTO `identifiershistory` (
        `project`,
        `page_id`,
        `page_title`,
        `identifier_type`,
        `identifier_id`,
        `start_date`,
        `end_date`
    ) VALUES (%s, %s, %s, %s, %s, %s, %s)
    """

    if args.create_tables:
        print("Creating tables and indexes")
        create_tables_and_indexes(db_conn.cursor())
        db_conn.commit()

    for file_path in args.input_files:
        print("Reading", file_path, "...")
        input_file = utils.open_compressed_file(file_path)
        cursor = db_conn.cursor()
        with input_file, cursor:
            csvreader = csv.reader(input_file)
            records = (utils.parse_identifier_history_record(r) for r in csvreader)

            records_truncated = (
                (
                    r.project,
                    r.page_id,
                    r.page_title[:255],
                    r.identifier_type[:20],
                    r.identifier_id[:255],
                    r.start_date,
                    r.end_date,
                )
                for r in records
            )

            cursor.executemany(insert_tpl, records_truncated)
    db_conn.commit()
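Examples #7 and #8 rely on `utils.parse_identifier_history_record`, which is not shown. A sketch of a plausible implementation, assuming it returns a namedtuple whose fields mirror the seven columns of the `identifiershistory` INSERT; the real helper may also parse dates or normalise empty values:

import collections

IdentifierHistoryRecord = collections.namedtuple(
    'IdentifierHistoryRecord',
    ['project', 'page_id', 'page_title', 'identifier_type',
     'identifier_id', 'start_date', 'end_date'],
)

def parse_identifier_history_record(row):
    # row is one CSV row with the seven fields in INSERT order.
    return IdentifierHistoryRecord(*row)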
Example #8
def main():
    args = parse_args()

    db_conn = pymysql.connect(**args.mysql_url)

    insert_tpl = '''
    INSERT INTO `identifiershistory` (
        `project`,
        `page_id`,
        `page_title`,
        `identifier_type`,
        `identifier_id`,
        `start_date`,
        `end_date`
    ) VALUES (%s, %s, %s, %s, %s, %s, %s)
    '''

    if args.create_tables:
        print('Creating tables and indexes')
        create_tables_and_indexes(db_conn.cursor())
        db_conn.commit()

    for file_path in args.input_files:
        print('Reading', file_path, '...')
        input_file = utils.open_compressed_file(file_path)
        cursor = db_conn.cursor()
        with input_file, cursor:
            csvreader = csv.reader(input_file)
            records = (utils.parse_identifier_history_record(r)
                       for r in csvreader)

            records_truncated = ((
                r.project,
                r.page_id,
                r.page_title[:255],
                r.identifier_type[:20],
                r.identifier_id[:255],
                r.start_date,
                r.end_date,
            ) for r in records)

            cursor.executemany(insert_tpl, records_truncated)
    db_conn.commit()
Example #9
def main():
    args = parse_args()

    move_actions = set(['move', 'move_redir'])

    input_file = utils.open_compressed_file(args.input_file)
    # output_file = gzip.open(str(args.output_file), 'wt', encoding='utf-8')
    output_file = sys.stdout

    with input_file, output_file:
        writer = csv.writer(output_file)
        writer.writerow(('timestamp', 'from', 'to'))

        logitems = iter_elems(
            input_file,
            tag='{http://www.mediawiki.org/xml/export-0.10/}logitem',
        )
        for logitem in logitems:
            action = logitem.find(
                '{http://www.mediawiki.org/xml/export-0.10/}action')

            if action.text not in move_actions:
                continue

            params = logitem.find(
                '{http://www.mediawiki.org/xml/export-0.10/}params')
            logtitle = logitem.find(
                '{http://www.mediawiki.org/xml/export-0.10/}logtitle')

            if params is None or logtitle is None:
                continue

            redirect = get_redirect(params.text)
            timestamp = logitem.find(
                '{http://www.mediawiki.org/xml/export-0.10/}timestamp')

            writer.writerow((
                timestamp.text,
                logtitle.text,
                redirect,
            ))
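Examples #9 and #10 stream <logitem> elements out of a MediaWiki XML dump with `iter_elems`, which is not defined on this page. A common way to implement such a helper, assuming it wraps xml.etree.ElementTree.iterparse and clears each element after use so the whole dump never has to fit in memory:

import xml.etree.ElementTree as ET

def iter_elems(source, tag):
    # Yield every element whose tag matches, then free it.
    for _event, elem in ET.iterparse(source, events=('end',)):
        if elem.tag == tag:
            yield elem
            elem.clear()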
Example #10
def main():
    args = parse_args()

    move_actions = set(['move', 'move_redir'])

    input_file = utils.open_compressed_file(args.input_file)
    # output_file = gzip.open(str(args.output_file), 'wt', encoding='utf-8')
    output_file = sys.stdout

    with input_file, output_file:
        writer = csv.writer(output_file)
        writer.writerow(('timestamp', 'from', 'to'))

        logitems = iter_elems(
            input_file,
            tag='{http://www.mediawiki.org/xml/export-0.10/}logitem',
        )
        for logitem in logitems:
            action = logitem.find('{http://www.mediawiki.org/xml/export-0.10/}action')

            if action.text not in move_actions:
                continue

            params = logitem.find('{http://www.mediawiki.org/xml/export-0.10/}params')
            logtitle = logitem.find('{http://www.mediawiki.org/xml/export-0.10/}logtitle')

            if params is None or logtitle is None:
                continue

            redirect = get_redirect(params.text)
            timestamp = logitem.find('{http://www.mediawiki.org/xml/export-0.10/}timestamp')

            writer.writerow((
                timestamp.text,
                logtitle.text,
                redirect,
            ))
Example #11
def main():
    args = parse_args()
    print(args)

    args.output_dir.mkdir(parents=True, exist_ok=True)

    # sqlite_conn = sqlite3.connect(
    #     str(args.moves_sqlite),
    #     detect_types=sqlite3.PARSE_DECLTYPES,
    # )
    # sqlite_conn.row_factory = sqlite3.Row
    # ipdb.set_trace()  ######### Break Point ###########
    #
    # r=get_page_periods(moves_conn, 'en', '\'Abd al-Rahman I')
    # periods = get_page_periods(moves_conn, 'en', 'Spanish conquest of Chiapas')
    db_url = args.db_url
    db_vars = dict(
        host=db_url.hostname,
        port=db_url.port or 3306,
        user=db_url.username,
        password=db_url.password or '',
        database=db_url.path.rpartition('/')[-1],
        charset='utf8',
    )
    print(db_vars)
    db_conn = pymysql.connect(
        **db_vars
    )

    counts_finder = pagecountssearch.Finder(args.counts_dataset_dir)
    views_counter = ViewsCounter(
        counts_finder,
        start_period=args.counts_period_start,
        end_period=args.counts_period_end,
        granularity=datetime.timedelta(hours=1),
    )

    for input_file_path in args.input_files:
        input_file = utils.open_compressed_file(input_file_path)
        basename = input_file_path.name

        output_file_path = args.output_dir/basename
        output_file = output_file_path.open('wt', encoding='utf-8')
        with input_file, output_file:
            raw_records = csv.reader(input_file)

            input_records = (parse_record(r) for r in raw_records)

            output_records = (
                OutputRecord(
                    *r,
                    counts_for_page(
                        db_conn,
                        views_counter,
                        r.project,
                        r.page_id,
                        r.page_title,
                        r.start_date,
                        r.end_date,
                    ),
                )
                for r in input_records
            )

            writer = csv.writer(output_file)

            for output_record in output_records:
                writer.writerow(output_record)
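Example #11 builds the pymysql connection kwargs from `args.db_url`, an object exposing hostname, port, username, password and path. A plausible assumption is that parse_args() produces it with urllib.parse.urlsplit, for instance:

from urllib.parse import urlsplit

# Hypothetical: how args.db_url might be constructed inside parse_args().
db_url = urlsplit('mysql://user:secret@db.example.org:3306/wikidb')

# Attributes used above:
#   db_url.hostname  -> 'db.example.org'
#   db_url.port      -> 3306
#   db_url.username  -> 'user'
#   db_url.password  -> 'secret'
#   db_url.path      -> '/wikidb'   (database name taken after the last '/')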
Example #12
def main():
    args = parse_args()

    db_conn = pymysql.connect(**args.mysql_url)

    insert_tpl = '''
    INSERT INTO `mag_papers` (
        `paper_id`,
        `original_paper_title`,
        `normalized_paper_title`,
        `paper_publish_year`,
        `paper_publish_date`,
        `paper_doi`,
        `original_venue_name`,
        `normalized_venue_name`,
        `journal_id_mapped_to_venue_name`,
        `conference_series_id_mapped_to_venue_name`,
        `paper_rank`
    ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
    '''

    if args.create_tables:
        print('Creating tables and indexes')
        create_tables_and_indexes(db_conn.cursor())
        db_conn.commit()

    print('Reading', args.input_csv, '...')
    input_file = utils.open_compressed_file(args.input_csv)
    cursor = db_conn.cursor()
    with input_file, cursor:
        csvreader = csv.reader(
            input_file,
            delimiter='\t',
            quoting=csv.QUOTE_NONE,
        )
        records = (
            parse_papers_record(r)
            for r in csvreader
        )

        records_truncated = (
            (
                r.paper_id[:50],
                r.original_paper_title[:255],
                r.normalized_paper_title[:255],
                r.paper_publish_year,
                r.paper_publish_date,
                r.paper_doi[:255],
                r.original_venue_name[:255],
                r.normalized_venue_name[:255],
                r.journal_id_mapped_to_venue_name[:255],
                r.converence_series_id_mapped_to_venue_name[:255],
                r.paper_rank,
            ) for r in records
        )

        records_with_progress = frogress.bar(
            records_truncated,
            steps=args.expected_records,
        )
        cursor.executemany(insert_tpl, records_with_progress)
    db_conn.commit()
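Examples #2 and #12 both assume a `parse_papers_record` helper whose result exposes the MAG paper fields. A sketch, assuming a namedtuple whose field names follow the attribute accesses in the examples (including the `converence_series_id_mapped_to_venue_name` spelling they use); the real parser may coerce years, dates and ranks to proper types:

import collections

MagPaperRecord = collections.namedtuple('MagPaperRecord', [
    'paper_id',
    'original_paper_title',
    'normalized_paper_title',
    'paper_publish_year',
    'paper_publish_date',
    'paper_doi',
    'original_venue_name',
    'normalized_venue_name',
    'journal_id_mapped_to_venue_name',
    'converence_series_id_mapped_to_venue_name',
    'paper_rank',
])

def parse_papers_record(row):
    # row is one tab-separated line from the MAG Papers file, already split by csv.reader.
    return MagPaperRecord(*row)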