Example #1
def run_rolodex(input_file, output_file):
	"""
	This program takes an input file of personal information in multiple formats.
	It normalizes every valid entry and dumps the sorted result into an output file.
	@param string input_file	input file name, relative to the current directory
	@param string output_file	output file name, relative to the current directory

	"""
	persons = []
	error_indices = []
	normalizer = Normalizer()

	with open(input_file) as infile:
		for line_number, line in enumerate(infile, start=1):
			try:
				person = normalizer.normalize(line.rstrip())
				persons.append(person)
			except NormalizationException:
				error_indices.append(line_number)

	sorted_persons = sorted(persons, key=str)

	output_dict = {
		"entries": sorted_persons,
		"errors": error_indices
	}
	with open(output_file, 'w') as outfile:
		json.dump(output_dict, outfile, indent=2, sort_keys=True)
	logging.info("Completed, please check output file.")
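The Normalizer used above is not shown; here is a minimal sketch of the interface this snippet assumes (the 'last, first, phone' format and field names are hypothetical), just enough to run the example end to end:

import json
import logging


class NormalizationException(Exception):
    """Raised when an input line cannot be parsed (assumed contract)."""


class Normalizer:
    """Hypothetical minimal normalizer expecting 'last, first, phone' lines."""

    def normalize(self, line):
        parts = [part.strip() for part in line.split(",")]
        if len(parts) != 3:
            raise NormalizationException("unrecognized format: %r" % line)
        last, first, phone = parts
        # Return a plain dict so json.dump can serialize it directly.
        return {"firstname": first, "lastname": last, "phonenumber": phone}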
Example #2
    def generate(self, edgeCount, tfidf=False, window_size=0, degree=False, closeness=False, groups=False):
        parser = XMLDataframeParser()
        text = parser.getText("./data/smokingRecords.xml")
        parser.addFeatureFromText(text, "HISTORY OF PRESENT ILLNESS :", "", True, True, "illness")
        df = parser.getDataframe()
        df_xml = parser.removeEmptyEntries(df, "illness")
        normalizer = Normalizer()
        if window_size == 0:
            vectorizer_args = dict(
                tokenizer=lambda text: normalizer.normalize(text, True, False),
                ngram_range=(2, 2))
            corpus = df_xml.illness
        else:
            vectorizer_args = dict(
                analyzer=lambda text: self.custom_analyser(text, 2, int(window_size)))
            corpus = normalizer.normalizeArray(df_xml.illness, True, False)
        vectorizer_cls = TfidfVectorizer if tfidf else CountVectorizer
        vectorizer = vectorizer_cls(**vectorizer_args)
        mostFreq2Grams = self.get_first_n_words(vectorizer, corpus, edgeCount)
        df_graph = self.create_dataframe(mostFreq2Grams)
        GF = nx.from_pandas_edgelist(df_graph, 'Node1', 'Node2', ["Weight"])

        if degree:
            # calculate degree centrality
            degree_centrality = nx.degree_centrality(GF)
            nx.set_node_attributes(GF, degree_centrality, "degree_centrality")
            
        if closeness:
            # calculate closeness centrality    
            closeness_centrality = nx.closeness_centrality(GF) 
            nx.set_node_attributes(GF, closeness_centrality, "closeness_centrality")

        if groups:
            # calculate partitions
            partition = community.best_partition(GF)
            nx.set_node_attributes(GF, partition, "group")

        payload = json_graph.node_link_data(GF)
        return payload
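create_dataframe, get_first_n_words and custom_analyser are defined elsewhere in this class; below is a self-contained sketch of just the graph-export tail of the method, with a toy edge list standing in for the real bigram counts:

import networkx as nx
import pandas as pd
from networkx.readwrite import json_graph

# Toy bigram co-occurrence edges standing in for create_dataframe's output.
df_graph = pd.DataFrame({
    "Node1": ["cough", "cough", "fever"],
    "Node2": ["fever", "smoker", "smoker"],
    "Weight": [3, 2, 1],
})
GF = nx.from_pandas_edgelist(df_graph, "Node1", "Node2", ["Weight"])

# Optional centrality attributes, mirroring the degree/closeness flags.
nx.set_node_attributes(GF, nx.degree_centrality(GF), "degree_centrality")
nx.set_node_attributes(GF, nx.closeness_centrality(GF), "closeness_centrality")

payload = json_graph.node_link_data(GF)
print(payload["nodes"])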
Example #3
def main(p):
    start = time.time()

    # select only the files whose names end with 'json.gz'
    file_name_list = filter(lambda x: x.endswith('json.gz'), os.listdir(p))

    # TODO: add a check that there are exactly 24 files (glob module)

    for file_name in file_name_list:
        with open(os.path.join(p, file_name), 'r') as f:
            raw_json_file = gzip.GzipFile(fileobj=f)

            record_cleaner = Cleaner()
            record_grouper = Grouper(db)
            record_normalizer = Normalizer(db)
            mongo_helper = MongoHelper(db)
            counter = ActorCounter()
            evaluater = Evaluater()

            # clean the data
            record_cleaner.set_dirty_data(raw_json_file)
            record_cleaner.clean()
            clean_record = record_cleaner.get_clean_data()
            log.log('clean record %s' % len(clean_record))
            # process the data

            # group the records
            record_grouper.set_records(clean_record)
            record_grouper.group()
            record_actor_exist = record_grouper.get_group_1()
            record_actor_new = record_grouper.get_group_2()
            log.log('record_actor_exist: %s' % len(record_actor_exist))
            log.log('record_actor_new: %s' % len(record_actor_new))


            # handle records whose actor already exists
            log.log('Begin processing actor-exist records...')
            # only the record's actor_attributes needs to be deleted
            for record in record_actor_exist:
                del record['actor_attributes']
            log.log('Finished.')


            # handle records whose actor does not exist yet
            record_normalizer.set_records(record_actor_new)
            record_normalizer.normalize()
            record_actor_new = record_normalizer.get_record_actor_new()
            new_actors = record_normalizer.get_new_actors()

            # push today's locally collected new actors to the database
            actors = new_actors.values()
            mongo_helper.insert_new_actors(actors)

            # for the new actors, update the corresponding counters in Redis
            counter.count_actor_list(actors)

            # compute the val of each record
            evaluater.set_records(record_actor_exist)
            evaluater.evaluate()
            val_actor_exist = evaluater.get_val_cache()

            evaluater.set_records(record_actor_new)
            evaluater.evaluate()
            val_actor_new = evaluater.get_val_cache()

            # insert the records into the database
            mongo_helper.insert_new_reocrds(record_actor_new)
            mongo_helper.insert_new_reocrds(record_actor_exist)

            # push today's per-user val increments to the database
            mongo_helper.update_val(val_actor_new)
            mongo_helper.update_val(val_actor_exist)

            record_cleaner.free_mem()
            del record_cleaner
            del record_grouper
            del record_normalizer
            del mongo_helper
            del counter
            del evaluater

    # generate the CSV file
    util.grcount2csv()

    end = time.time()
    log.log('total: %s s' % (end - start))
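A small aside on the gzip handling in this pipeline: the archive is decompressed by wrapping an already-open file object in gzip.GzipFile. A minimal stand-alone sketch of that pattern (the file name is hypothetical; under Python 3 the file must be opened in binary mode):

import gzip
import json

path = '2015-01-01-0.json.gz'  # hypothetical GitHub-archive-style dump
with open(path, 'rb') as f:  # binary mode so GzipFile can decompress the stream
    raw_json_file = gzip.GzipFile(fileobj=f)
    # each line of the decompressed stream is one JSON record
    records = [json.loads(line) for line in raw_json_file]
print('%s records loaded' % len(records))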
Example #4
def run_pipeline():

    #get training data
    training_data = pd.read_csv('worldbank-data/WDI_Data.csv')
    training_data.set_index(['Country Name', 'Indicator Name'], inplace=True)

    #convert to panel
    panel = training_data.to_panel()
    panel.drop(['Indicator Code', 'Country Code'], axis=0, inplace=True)
    panel = panel.swapaxes(0, 1)

    indicators_to_use = [
        'Agriculture, value added (% of GDP)',
        'Industry, value added (% of GDP)',
        'Services, etc., value added (% of GDP)',
        'Domestic credit provided by financial sector (% of GDP)',
        'GDP growth (annual %)', 'GDP (current US$)', 'Expense (% of GDP)',
        'Inflation, consumer prices (annual %)',
        'Inflation, GDP deflator (annual %)',
        'Total debt service (% of exports of goods, services and primary income)',
        'Current account balance (BoP, current US$)',
        'External balance on goods and services (% of GDP)',
        'Health expenditure, total (% of GDP)', 'Tax revenue (% of GDP)',
        'Gross capital formation (% of GDP)', 'Gross savings (% of GDP)',
        'Net investment in nonfinancial assets (% of GDP)',
        'Bank capital to assets ratio (%)',
        'Bank nonperforming loans to total gross loans (%)',
        'Broad money (% of GDP)',
        'Commercial bank branches (per 100,000 adults)',
        'Deposit interest rate (%)', 'Real interest rate (%)',
        'Risk premium on lending (lending rate minus treasury bill rate, %)',
        'Total reserves (includes gold, current US$)',
        'Unemployment, total (% of total labor force) (modeled ILO estimate)',
        'Interest rate spread (lending rate minus deposit rate, %)'
    ]
    print len(indicators_to_use), 'indicators used'
    panel = panel[:, :, indicators_to_use]

    target_variables = [
        'Agriculture, value added (% of GDP)',
        'Industry, value added (% of GDP)',
        'Services, etc., value added (% of GDP)', 'GDP growth (annual %)',
        'Inflation, GDP deflator (annual %)',
        'Gross capital formation (% of GDP)', 'Gross savings (% of GDP)',
        'Bank capital to assets ratio (%)',
        'Bank nonperforming loans to total gross loans (%)',
        'Deposit interest rate (%)', 'Real interest rate (%)',
        'Risk premium on lending (lending rate minus treasury bill rate, %)',
        'Unemployment, total (% of total labor force) (modeled ILO estimate)',
        'Interest rate spread (lending rate minus deposit rate, %)'
    ]
    # drop countries with mostly missing data, such as Samoa, Lesotho, and so on
    useful_countries = []
    for country in panel.axes[0]:
        if find_null_percentage(panel[country, :, :]) < 0.7:
            useful_countries.append(country)
    panel = panel.ix[useful_countries, :, :]

    normalizer = Normalizer(panel)
    normalized_panel = normalizer.normalize(panel)

    # #visualize normalization:
    # for indicator in normalized_panel.axes[2]:
    #     plot_hist(indicator, [panel, normalized_panel])

    # select train data
    years_to_validate = 1
    years_to_predict = 10
    years_train = generate_year_list(stop=2016 - years_to_validate)
    years_val = generate_year_list(start=2016 - years_to_validate + 1)
    years_predict = generate_year_list(start=2017,
                                       stop=2016 + years_to_predict)
    train_panel = normalized_panel[:, years_train, :].copy()

    # fill missing values:
    # either banal mean or median filling
    # or sampling with a generative bidirectional LSTM - see https://arxiv.org/abs/1306.1091

    generative_model = dense_generative_model(train_panel,
                                              hidden_layers=[120],
                                              epochs=100)
    sampled_filled_values = iterative_fill(generative_model,
                                           train_panel,
                                           normalizer,
                                           iterations=50,
                                           burn_in=10)
    train_panel.update(sampled_filled_values, overwrite=False)
    # or
    # train_panel.fillna(0, inplace=True)
    # or
    # train_panel = iterative_fill_bLSTM(train_panel)
    # or
    # filled_panel = fill_missing_bLSTM(train_panel, epochs=100)
    # train_panel.update(filled_panel, overwrite=False)
    # or
    # interpolate(train_panel)

    # create 1-step-ahead model
    epochs = 200
    hl = [100, 100]
    print "ARCHITECTURE:", hl
    print 'EPOCHS:', epochs
    X_train = train_panel[:, years_train, :][:, :-1, :]
    y_train = train_panel[:, years_train, :][:, 1:, :]
    model = dense_gradient_model(X_train,
                                 y_train,
                                 hidden_layers=hl,
                                 d=0.2,
                                 patience=50,
                                 epochs=epochs)

    # finally, predict
    for start, year in enumerate(years_val + years_predict):
        predictions = model.predict(train_panel[:,
                                                start + 1:, :].values)[:,
                                                                       -1, :]
        train_panel = train_panel.swapaxes(0, 1)
        new_year_df = pd.DataFrame(data=predictions,
                                   index=train_panel.axes[1],
                                   columns=y_train.axes[2])
        train_panel[year] = new_year_df
        train_panel = train_panel.swapaxes(0, 1)
    print "score:", rmse(
        normalized_panel[:, years_val, target_variables].values,
        train_panel[:, years_val, target_variables].values)

    #revert to original scale and distributions
    train_panel = normalizer.renormalize(train_panel)

    #convert to dataframe, and write relevant information to file
    target_countries = ['Bulgaria', 'Cyprus', 'Albania']
    train_panel = train_panel.swapaxes(0, 1)
    df = train_panel[:, target_countries,
                     target_variables].to_frame(filter_observations=False)
    df.to_csv('Predictions.csv')
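The prediction loop above is a one-step-ahead rollout: each predicted year is appended to the panel and fed back in to predict the next year. A minimal sketch of that rollout idea on a plain numpy array (the toy model below is a stand-in, not the Keras model trained above):

import numpy as np

def rollout(predict, history, steps):
    """Repeatedly predict the next timestep and append it to the history."""
    history = history.copy()  # shape: (countries, years, indicators)
    for _ in range(steps):
        next_step = predict(history)[:, -1, :]  # keep only the newest year
        history = np.concatenate([history, next_step[:, None, :]], axis=1)
    return history

toy_model = lambda x: x + 0.01  # "predicts" the previous year plus a small drift
extended = rollout(toy_model, np.zeros((3, 5, 2)), steps=10)
print(extended.shape)  # (3, 15, 2)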
Example #5
        _text = text
        for func in funcs:
            _text = func(_text)
        return _text


if __name__ == '__main__':
    import sys
    import traceback
    from normalize import Normalizer

    fd = open(sys.argv[1]) if len(sys.argv) >= 2 else sys.stdin
    ps = JaWikiPreprocess()
    norm = Normalizer()

    for _line in (_.strip() for _ in fd):
        for line in _line.split("。"):
            try:
                conv = ps.execute(line)
                if conv:
                    print(norm.normalize(conv + "。"))
            except KeyboardInterrupt:
                exit()
            except Exception:
                traceback.print_exc()

    # for file in jawiki-latest*.txt; do python
    #       ~/Projects/cabocha/jawiki_preprocess.py $file >preprocess/$file.pre.txt;
    #   done
    # for file in ~/jawiki/preprocess/jawiki-latest*.txt;
    #    do python ~/Projects/cabocha/case_mongo.py $file || exit; done
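The loop at the top of this example simply applies a list of text-cleanup functions in order; the same idea as a tiny stand-alone helper (the function list is illustrative):

from functools import reduce

def chain(text, funcs):
    """Apply each function to the running text, left to right."""
    return reduce(lambda acc, func: func(acc), funcs, text)

print(chain('  Hello World ', [str.strip, str.lower]))  # -> 'hello world'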