def data_retriever(data_source, query, save_filename, *, lang='', proxy=None,
                   remove_duplicates=False, twapi_max=None, twapi_sleep_time=0,
                   twscrape_poolsize=20, twscrape_begindate=None,
                   ghapi_org=None, ghapi_since=None, soapi_begindate=None):
    """Fetch raw data from one of the supported sources and export it as CSV."""
    cl.section('Data Retriever')
    cl.info('Starting to retrieve query: %s, or org: %s' % (query, ghapi_org))
    cl.info('From data source: %s' % data_source)
    cl.info('Using proxy: %s' % proxy)
    cl.info('Remove duplicates: %s' % remove_duplicates)

    if proxy:
        os.environ['HTTP_PROXY'] = proxy
        os.environ['HTTPS_PROXY'] = proxy

    # Dispatch to the retriever matching the requested data source.
    if data_source == 'twitter_standard_api':
        data = twapi_search(query, twapi_max, sleep_time=twapi_sleep_time,
                            lang=lang)
    elif data_source == 'twitterscraper':
        data = twscrape_search(query, lang=lang, poolsize=twscrape_poolsize,
                               begindate=twscrape_begindate)
    elif data_source == 'github_api':
        data = github_issue_org_fetch(ghapi_org, ghapi_since)
    elif data_source == 'stackoverflow_api':
        data = soapi_search(query, begindate=soapi_begindate)
    else:
        cl.error('Data source %r is not implemented' % data_source)
        sys.exit(-1)

    if remove_duplicates:
        # Materialize the iterator once so it can be exported twice:
        # deduplicated under the requested name, and in full with a suffix.
        data = iterator_aggregate_list(data)
        data_no_duplicate_text = remove_duplicate_text(data)
        cl.info('Exporting data without duplicate text')
        export_csv(data_no_duplicate_text, data_source_file(save_filename))
        save_filename_full = name_with_title_suffix(save_filename, '-full')
        cl.info('Exporting full data')
        export_csv(data, data_source_file(save_filename_full))
    else:
        export_csv(data, data_source_file(save_filename))

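# A minimal usage sketch (not part of the original pipeline): fetch English
# tweets for a placeholder query via twitterscraper and deduplicate on export.
# The query string, output file name, and begin date are hypothetical values.
def _example_data_retriever():
    from datetime import date
    data_retriever('twitterscraper', 'machine learning', 'ml-tweets.csv',
                   lang='en', twscrape_poolsize=20,
                   twscrape_begindate=date(2019, 1, 1),
                   remove_duplicates=True)
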
def load_all(modeldesc, sourcedesc):
    modelfilename = model_file('ldamodel-%s' % modeldesc)
    ldamodel = LdaMulticore.load(modelfilename)
    corpus = file_read_json(model_file('ldacorpus-%s.json' % modeldesc))
    prep_items = file_read_json(data_source_file(sourcedesc + '.prep.json'))

    sourcefilename = data_source_file(sourcedesc + '.csv')
    reader = csv_reader(sourcefilename)
    source_texts = {row['id']: row['text'] for row in reader}

    return ldamodel, corpus, prep_items, source_texts

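# Sketch of consuming load_all. The modeldesc below mirrors the description
# string produced by lda_topic_model
# ('<keyword>-<size>-<topics>-<iterations>x<passes>-<timestamp>');
# both arguments are placeholder values.
def _example_load_all():
    ldamodel, corpus, prep_items, source_texts = load_all(
        'vuejs-1000-10-100x2-20190101000000', 'vuejs')
    print(ldamodel.num_topics, len(corpus), len(prep_items), len(source_texts))
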
def text_preprocessor(input_filename, *, preprocessor_cls='TextPreprocessor',
                      custom_stop_words=None, lem_ignore_patterns=None,
                      remove_duplicates=False):
    """Clean and lemmatize a source CSV, then save the preprocessed result."""
    cl.section('Text Preprocessor')

    input_filename = data_source_file(input_filename)
    # The preprocessor class is passed by name and resolved in this
    # module's namespace.
    preprocessor_cls = globals()[preprocessor_cls]

    with TimeMeasure('preprocess_text'):
        result = preprocess_csv(input_filename,
                                preprocessor_cls=preprocessor_cls,
                                custom_stop_words=custom_stop_words,
                                lem_ignore_patterns=lem_ignore_patterns)
        if remove_duplicates:
            result = remove_duplicate_text(result)
        # Materialize the iterator so it can be sized and saved.
        result = tuple(result)
        cl.info('Effective data size: %d' % len(result))

    with TimeMeasure('save_preprocessed'):
        save_preprocessed(result, input_filename)

def remove_file(f):
    with db.atomic():
        filepath = data_source_file(f.physical_name)
        # Delete the database record first; a file already missing on disk
        # is then tolerated instead of leaving a dangling row behind.
        f.delete_instance()
        with contextlib.suppress(FileNotFoundError):
            os.remove(filepath)

def retweets_recover(csvfilename):
    cl.section('Retweets Recover')
    cl.info('Recovering file: %s' % csvfilename)

    csvfilename = data_source_file(csvfilename)
    result = recover_from_csv(csvfilename)
    exportfilename = name_with_title_suffix(csvfilename, '-recovered')
    export_csv(result, exportfilename)
    return os.path.basename(exportfilename)

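# Usage sketch: recover retweet text in a placeholder CSV and keep the
# returned basename (e.g. 'tweets-recovered.csv') for a follow-up step.
def _example_retweets_recover():
    recovered = retweets_recover('tweets.csv')
    cl.info('Recovered file ready: %s' % recovered)
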
def get_file_path(f):
    if f.file_type == 'source':
        path = data_source_file(f.physical_name)
    elif f.file_type in ('plot', 'report'):
        path = report_file(f.physical_name)
    else:
        path = ''
    # Returns the path only if the file exists on disk, otherwise a
    # falsy value.
    return os.path.isfile(path) and path

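# Because get_file_path returns either an existing path or a falsy value,
# callers can test and use the result in one step. 'f' stands for the same
# file record type used above; this helper is illustrative only.
def _example_serve_file(f):
    path = get_file_path(f)
    if path:
        cl.info('Serving file: %s' % path)
    else:
        cl.error('File missing on disk: %s' % f.physical_name)
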
def visualization_twlda(keyword, desc, desc_show, userinfofile, topusers=20,
                        encoding='utf-8', portable=True, open_browser=True):
    cl.section('Twitter-LDA Visualization')

    user_topic = parse_user_topic(desc, encoding=encoding)
    topic_words = parse_topic_words(desc, encoding=encoding)
    user_info = load_user_info(data_source_file(userinfofile))
    result = organize_data('test', user_topic, topic_words, user_info,
                           topusers)
    return export_html(keyword, desc_show, result, portable, open_browser)

def text_preprocessor_user(sourcedesc):
    cl.section('Text Preprocessor Grouping By User')
    assert re.fullmatch(r'[-_0-9a-zA-Z+]+', sourcedesc)

    csvfilename = data_source_file('%s.csv' % sourcedesc)

    with TimeMeasure('preprocess_text'):
        result = list(preprocess_csv(csvfilename))

    with TimeMeasure('save_preprocessed'):
        savefilename = name_with_title_suffix(csvfilename, '-user')
        export_csv(result, savefilename)

def random_sampler(csvfilename, amount):
    cl.section('Data Random Sampler')
    cl.info('Random sampling file: %s' % csvfilename)
    cl.info('Amount: %d' % amount)

    csvfilename = data_source_file(csvfilename)
    data = list(csv_reader(csvfilename))
    random.shuffle(data)
    data = data[:amount]

    exportfilename = name_with_title_suffix(csvfilename, '-sample-%d' % amount)
    export_csv(data, exportfilename)

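# Sketch: draw 100 rows from a placeholder CSV. Seeding the module RNG first
# makes the draw reproducible; that is an extra step taken here for
# illustration, not something random_sampler does itself.
def _example_random_sampler():
    random.seed(42)
    random_sampler('tweets.csv', 100)
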
def text_preprocessor_twlda(sourcedesc, *, tweet_min_length=3,
                            user_min_tweets=1, remove_duplicates=False):
    cl.section('Text Preprocessor For Twitter-LDA')
    assert re.fullmatch(r'[-_0-9a-zA-Z+]+', sourcedesc)

    input_filename = data_source_file('%s.csv' % sourcedesc)

    with TimeMeasure('preprocess_text'):
        prepdata, sourcedata = preprocess_csv(input_filename,
                                              tweet_min_length,
                                              user_min_tweets,
                                              remove_duplicates)

    with TimeMeasure('save_preprocessed'):
        save_preprocessed(prepdata, sourcedata)

def get_usernames(tweets_file):
    reader = csv_reader(data_source_file(tweets_file))
    return list({row['user'] for row in reader})

def user_info_retriever(usernames, csvfilename):
    cl.section('Twitter User Info Retriever')

    csvfilename = data_source_file(csvfilename)
    result = retrieve_user_info(usernames)
    export_csv(result, csvfilename)

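# Sketch chaining the two helpers above: collect the distinct authors in a
# scraped tweets file, then fetch their profiles. Both file names are
# placeholders.
def _example_user_info():
    usernames = get_usernames('vuejs.csv')
    user_info_retriever(usernames, 'vuejs-userinfo.csv')
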
def lda_topic_model(input_filename, keyword, size, *, num_topics,
                    iterations=50, passes=1, chunksize=2000, eval_every=10,
                    verbose=False, gamma_threshold=0.001, filter_no_below=5,
                    filter_no_above=0.5, filter_keep_n=100000,
                    open_browser=True):
    """Train an LDA topic model with gensim and export a pyLDAvis report."""
    cl.section('LDA Topic Model Training')
    cl.info('Keyword: %s' % keyword)
    cl.info('Data size: %d' % size)
    cl.info('Number of topics: %d' % num_topics)
    cl.info('Iterations: %d' % iterations)
    cl.info('Passes: %d' % passes)
    cl.info('Chunk size: %d' % chunksize)
    cl.info('Eval every: %s' % eval_every)
    cl.info('Verbose: %s' % verbose)
    cl.info('Gamma Threshold: %f' % gamma_threshold)
    cl.info('Filter no below: %d' % filter_no_below)
    cl.info('Filter no above: %f' % filter_no_above)
    cl.info('Filter keep n: %d' % filter_keep_n)

    assert re.fullmatch(r'[-_0-9a-zA-Z+]+', keyword)

    input_filename = data_source_file(input_filename)
    description = '%s-%d-%d-%dx%d-%s' % (keyword, size, num_topics,
                                         iterations, passes,
                                         time.strftime('%Y%m%d%H%M%S'))

    if verbose:
        log_filename = log_file('ldalog-%s.log' % description)
        logging.basicConfig(
            format='%(asctime)s : %(levelname)s : %(message)s',
            level=logging.DEBUG, filename=log_filename)
        cl.info('Writing logs into file: %s' % log_filename)

    with TimeMeasure('load_preprocessed_text'):
        preprocessed_texts = file_read_json(input_filename)
        # Each item is (id, tokens); only the token lists are needed here.
        preprocessed_texts = [item[1] for item in preprocessed_texts]

    with TimeMeasure('gen_dict_corpus'):
        cl.progress('Generating dictionary and corpus...')

        dictionary = Dictionary(preprocessed_texts, prune_at=None)
        dictionary.filter_extremes(no_below=filter_no_below,
                                   no_above=filter_no_above,
                                   keep_n=filter_keep_n)
        dictionary.compactify()

        corpus = [dictionary.doc2bow(text) for text in preprocessed_texts]

        corpusfilename = model_file('ldacorpus-%s.json' % description)
        file_write_json(corpusfilename, corpus)
        cl.success('Corpus saved as: %s' % corpusfilename)

    with TimeMeasure('training'):
        cl.progress('Performing training...')
        with NoConsoleOutput():
            ldamodel = LdaMulticore(corpus, workers=N_WORKERS,
                                    id2word=dictionary,
                                    num_topics=num_topics,
                                    iterations=iterations, passes=passes,
                                    chunksize=chunksize,
                                    eval_every=eval_every,
                                    gamma_threshold=gamma_threshold,
                                    alpha='symmetric', eta='auto')
        cl.success('Training finished.')

    with TimeMeasure('save_model'):
        modelfilename = 'ldamodel-%s' % description
        ldamodel.save(model_file(modelfilename))
        cl.success('Model saved as: %s' % modelfilename)

    with TimeMeasure('measure_coherence'):
        cl.progress('Measuring topic coherence...')
        measure_coherence(ldamodel, preprocessed_texts, corpus, dictionary)

    with TimeMeasure('vis_save'):
        cl.progress('Preparing visualization...')
        vis = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)
        htmlfilename = report_file('ldavis-%s.html' % description)
        pyLDAvis.save_html(vis, htmlfilename)
        cl.success('Visualized result saved in file: %s' % htmlfilename)

    if open_browser:
        open_html_in_browser(htmlfilename)
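
# End-to-end sketch with placeholder names, assuming the preprocessor writes
# '<name>.prep.json' next to the source CSV (the naming load_all expects) and
# that the size argument matches the effective data size it reports.
def _example_lda_pipeline():
    data_retriever('twitterscraper', 'vuejs', 'vuejs.csv', lang='en')
    text_preprocessor('vuejs.csv', remove_duplicates=True)
    lda_topic_model('vuejs.prep.json', 'vuejs', 1000, num_topics=10,
                    iterations=100, passes=2, open_browser=False)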