Example No. 1
def data_retriever(data_source,
                   query,
                   save_filename,
                   *,
                   lang='',
                   proxy=None,
                   remove_duplicates=False,
                   twapi_max=None,
                   twapi_sleep_time=0,
                   twscrape_poolsize=20,
                   twscrape_begindate=None,
                   ghapi_org=None,
                   ghapi_since=None,
                   soapi_begindate=None):
    cl.section('Data Retriever')
    cl.info('Starting to retrieve query: %s, or org: %s' % (query, ghapi_org))
    cl.info('From data source: %s' % data_source)
    cl.info('Using proxy: %s' % proxy)
    cl.info('Remove duplicates: %s' % remove_duplicates)

    if proxy:
        os.environ['HTTP_PROXY'] = proxy
        os.environ['HTTPS_PROXY'] = proxy

    if data_source == 'twitter_standard_api':
        data = twapi_search(query,
                            twapi_max,
                            sleep_time=twapi_sleep_time,
                            lang=lang)
    elif data_source == 'twitterscraper':
        data = twscrape_search(query,
                               lang=lang,
                               poolsize=twscrape_poolsize,
                               begindate=twscrape_begindate)
    elif data_source == 'github_api':
        data = github_issue_org_fetch(ghapi_org, ghapi_since)
    elif data_source == 'stackoverflow_api':
        data = soapi_search(query, begindate=soapi_begindate)
    else:
        cl.error('Data source %r is not implemented' % data_source)
        sys.exit(-1)

    if remove_duplicates:
        data = iterator_aggregate_list(data)
        data_no_duplicate_text = remove_duplicate_text(data)
        cl.info('Exporting data without duplicate text')
        export_csv(data_no_duplicate_text, data_source_file(save_filename))

        save_filename_full = name_with_title_suffix(save_filename, '-full')
        cl.info('Exporting full data')
        export_csv(data, data_source_file(save_filename_full))
    else:
        export_csv(data, data_source_file(save_filename))
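
For context, a hypothetical call to data_retriever could look like the sketch below; the data source name 'twitterscraper' comes from the function itself, but the query, the file name, and the use of datetime.date for twscrape_begindate are assumptions.

import datetime

# Illustrative sketch only: the query, the file name and the date type for
# twscrape_begindate are assumptions, not confirmed by the source.
data_retriever('twitterscraper',
               'machine learning',
               'ml-tweets.csv',
               lang='en',
               remove_duplicates=True,
               twscrape_begindate=datetime.date(2020, 1, 1))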
def load_all(modeldesc, sourcedesc):
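    """Load a trained LDA model, its corpus, the preprocessed items and the
    original source texts (id -> text) for the given descriptors."""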
    modelfilename = model_file('ldamodel-%s' % modeldesc)
    ldamodel = LdaMulticore.load(modelfilename)

    corpus = file_read_json(model_file('ldacorpus-%s.json' % modeldesc))

    prep_items = file_read_json(data_source_file(sourcedesc + '.prep.json'))

    sourcefilename = data_source_file(sourcedesc + '.csv')
    reader = csv_reader(sourcefilename)
    source_texts = {row['id']: row['text'] for row in reader}

    return ldamodel, corpus, prep_items, source_texts
def text_preprocessor(input_filename,
                      *,
                      preprocessor_cls='TextPreprocessor',
                      custom_stop_words=None,
                      lem_ignore_patterns=None,
                      remove_duplicates=False):
    cl.section('Text Preprocessor')

    input_filename = data_source_file(input_filename)
    preprocessor_cls = globals()[preprocessor_cls]

    with TimeMeasure('preprocess_text'):
        result = preprocess_csv(input_filename,
                                preprocessor_cls=preprocessor_cls,
                                custom_stop_words=custom_stop_words,
                                lem_ignore_patterns=lem_ignore_patterns)

        if remove_duplicates:
            result = remove_duplicate_text(result)

        result = tuple(result)
        cl.info('Effective data size: %d' % len(result))

    with TimeMeasure('save_preprocessed'):
        save_preprocessed(result, input_filename)
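
A hypothetical call to text_preprocessor; the input file name, the stop words and the ignore patterns below are illustrative assumptions only.

# Illustrative sketch only: all argument values are assumptions.
text_preprocessor('ml-tweets.csv',
                  custom_stop_words=['rt', 'via'],
                  lem_ignore_patterns=[r'https?://\S+'],
                  remove_duplicates=True)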
Example No. 4
def remove_file(f):
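    """Delete the database record for the file inside a transaction, then
    remove the physical file from disk, ignoring it if it is already gone."""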
    with db.atomic():
        filepath = data_source_file(f.physical_name)
        f.delete_instance()

    with contextlib.suppress(FileNotFoundError):
        os.remove(filepath)
Example No. 5
def retweets_recover(csvfilename):
    cl.section('Retweets Recover')
    cl.info('Recovering file: %s' % csvfilename)

    csvfilename = data_source_file(csvfilename)
    result = recover_from_csv(csvfilename)
    exportfilename = name_with_title_suffix(csvfilename, '-recovered')
    export_csv(result, exportfilename)
    return os.path.basename(exportfilename)
Example No. 6
def get_file_path(f):
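    """Resolve the on-disk path for a file record and return it only if the
    file actually exists; otherwise return a falsy value."""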
    if f.file_type == 'source':
        path = data_source_file(f.physical_name)
    elif f.file_type in ('plot', 'report'):
        path = report_file(f.physical_name)
    else:
        path = ''

    return os.path.isfile(path) and path
def visualization_twlda(keyword, desc, desc_show, userinfofile, topusers=20,
                        encoding='utf-8', portable=True, open_browser=True):
    cl.section('Twitter-LDA Visualization')

    user_topic = parse_user_topic(desc, encoding=encoding)
    topic_words = parse_topic_words(desc, encoding=encoding)
    user_info = load_user_info(data_source_file(userinfofile))
    result = organize_data('test', user_topic, topic_words, user_info,
                           topusers)
    return export_html(keyword, desc_show, result, portable, open_browser)
def text_preprocessor_user(sourcedesc):
    cl.section('Text Preprocessor Grouping By User')

    assert re.fullmatch(r'[-_0-9a-zA-Z+]+', sourcedesc)

    csvfilename = data_source_file('%s.csv' % sourcedesc)

    with TimeMeasure('preprocess_text'):
        result = list(preprocess_csv(csvfilename))

    with TimeMeasure('save_preprocessed'):
        savefilename = name_with_title_suffix(csvfilename, '-user')
        export_csv(result, savefilename)
Example No. 9
def random_sampler(csvfilename, amount):
    cl.section('Data Random Sampler')
    cl.info('Random sampling file: %s' % csvfilename)
    cl.info('Amount: %d' % amount)

    csvfilename = data_source_file(csvfilename)
    data = list(csv_reader(csvfilename))

    random.shuffle(data)
    data = data[:amount]

    exportfilename = name_with_title_suffix(csvfilename, '-sample-%d' % amount)
    export_csv(data, exportfilename)
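
A hypothetical call to random_sampler; the file name and the sample size are illustrative assumptions.

# Illustrative sketch only: file name and amount are assumptions.
random_sampler('ml-tweets.csv', 500)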
def text_preprocessor_twlda(sourcedesc,
                            *,
                            tweet_min_length=3,
                            user_min_tweets=1,
                            remove_duplicates=False):
    cl.section('Text Preprocessor For Twitter-LDA')

    assert re.fullmatch(r'[-_0-9a-zA-Z+]+', sourcedesc)

    input_filename = data_source_file('%s.csv' % sourcedesc)

    with TimeMeasure('preprocess_text'):
        prepdata, sourcedata = preprocess_csv(input_filename, tweet_min_length,
                                              user_min_tweets,
                                              remove_duplicates)

    with TimeMeasure('save_preprocessed'):
        save_preprocessed(prepdata, sourcedata)
Example No. 11
def get_usernames(tweets_file):
    return list(
        set(row['user'] for row in csv_reader(data_source_file(tweets_file))))
Example No. 12
def user_info_retriever(usernames, csvfilename):
    cl.section('Twitter User Info Retriever')

    csvfilename = data_source_file(csvfilename)
    result = retrieve_user_info(usernames)
    export_csv(result, csvfilename)
Example No. 13
def lda_topic_model(input_filename, keyword, size, *, num_topics,
                    iterations=50, passes=1, chunksize=2000, eval_every=10,
                    verbose=False, gamma_threshold=0.001, filter_no_below=5,
                    filter_no_above=0.5, filter_keep_n=100000,
                    open_browser=True):
    cl.section('LDA Topic Model Training')
    cl.info('Keyword: %s' % keyword)
    cl.info('Data size: %d' % size)
    cl.info('Number of topics: %d' % num_topics)
    cl.info('Iterations: %d' % iterations)
    cl.info('Passes: %d' % passes)
    cl.info('Chunk size: %d' % chunksize)
    cl.info('Eval every: %s' % eval_every)
    cl.info('Verbose: %s' % verbose)
    cl.info('Gamma Threshold: %f' % gamma_threshold)
    cl.info('Filter no below: %d' % filter_no_below)
    cl.info('Filter no above: %f' % filter_no_above)
    cl.info('Filter keep n: %d' % filter_keep_n)

    assert re.fullmatch(r'[-_0-9a-zA-Z+]+', keyword)

    input_filename = data_source_file(input_filename)
    description = '%s-%d-%d-%dx%d-%s' % (keyword, size, num_topics, iterations,
                                         passes, time.strftime('%Y%m%d%H%M%S'))

    if verbose:
        log_filename = log_file('ldalog-%s.log' % description)
        logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                            level=logging.DEBUG, filename=log_filename)
        cl.info('Writing logs into file: %s' % log_filename)

    with TimeMeasure('load_preprocessed_text'):
        preprocessed_texts = file_read_json(input_filename)
        preprocessed_texts = [item[1] for item in preprocessed_texts]

    with TimeMeasure('gen_dict_corpus'):
        cl.progress('Generating dictionary and corpus...')

        dictionary = Dictionary(preprocessed_texts, prune_at=None)
        dictionary.filter_extremes(no_below=filter_no_below,
                                   no_above=filter_no_above,
                                   keep_n=filter_keep_n)
        dictionary.compactify()

        corpus = [dictionary.doc2bow(text) for text in preprocessed_texts]

        corpusfilename = model_file('ldacorpus-%s.json' % description)
        file_write_json(corpusfilename, corpus)
        cl.success('Corpus saved as: %s' % corpusfilename)

    with TimeMeasure('training'):
        cl.progress('Performing training...')

        with NoConsoleOutput():
            ldamodel = LdaMulticore(corpus, workers=N_WORKERS,
                                    id2word=dictionary, num_topics=num_topics,
                                    iterations=iterations, passes=passes,
                                    chunksize=chunksize, eval_every=eval_every,
                                    gamma_threshold=gamma_threshold,
                                    alpha='symmetric', eta='auto')

        cl.success('Training finished.')

    with TimeMeasure('save_model'):
        modelfilename = 'ldamodel-%s' % description
        ldamodel.save(model_file(modelfilename))
        cl.success('Model saved as: %s' % modelfilename)

    with TimeMeasure('measure_coherence'):
        cl.progress('Measuring topic coherence...')
        measure_coherence(ldamodel, preprocessed_texts, corpus, dictionary)

    with TimeMeasure('vis_save'):
        cl.progress('Preparing visualization...')
        vis = pyLDAvis.gensim.prepare(ldamodel, corpus, dictionary)
        htmlfilename = 'ldavis-%s.html' % description
        htmlfilename = report_file(htmlfilename)
        pyLDAvis.save_html(vis, htmlfilename)
        cl.success('Visualized result saved in file: %s' % htmlfilename)

    if open_browser:
        open_html_in_browser(htmlfilename)
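
A hypothetical training run with lda_topic_model; the input file name (assumed to be the '.prep.json' output of the preprocessing step, following the naming used in load_all) and all hyper-parameter values are illustrative assumptions.

# Illustrative sketch only: file name and hyper-parameters are assumptions.
lda_topic_model('ml-tweets.prep.json', 'mltweets', 10000,
                num_topics=12, iterations=100, passes=5,
                verbose=True, open_browser=False)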