def main():
    """Run the feature-extraction pipeline and dump the result to ``features.csv``.

    Steps, in order:

    1. Build a logger named ``ph_feature_extraction`` whose ``_dir`` is the
       current US/Pacific date (``YYYY-MM-DD``).
    2. Publish the absolute path of ``db/cfg/dbsetup.yml`` as ``DB_CONFIG``
       and open a database session from it.
    3. Publish the absolute path of ``features.csv`` as ``FEATURES``.
    4. Extract the features, clean them and write them to the ``FEATURES``
       path.
    """
    # Logging.
    pacific_today = datetime.datetime.now(timezone('US/Pacific'))
    log_dir = pacific_today.strftime("%Y-%m-%d")
    log = logging_config.get_logger(
        _dir=log_dir,
        name="ph_feature_extraction",
        console_level=logging.ERROR,
    )

    # Environment variables and database session.
    db_config_path = os.path.abspath('db/cfg/dbsetup.yml')
    os.environ['DB_CONFIG'] = db_config_path
    db_session = setup_db(db_config_path)

    # FEATURES is exported only after the session exists, as before.
    features_path = os.path.abspath('features.csv')
    os.environ['FEATURES'] = features_path

    # Extract -> clean -> write.
    raw_rows = extract_all_features(db_session, log)
    cleaned_rows = clean_all_features(raw_rows)
    write_all_features(features_path, cleaned_rows)
def main():
    """Extract all features, write them to CSV, then post-process the CSV.

    Steps, in order:

    1. Build a logger named ``ph_feature_extraction`` whose ``_dir`` is the
       current US/Pacific date (``YYYY-MM-DD``).
    2. Publish the absolute path of ``db/cfg/dbsetup.yml`` as ``DB_CONFIG``
       and open a database session from it.
    3. Compute the output path ``<base>/dataset/features_not_discretized.csv``
       and publish its absolute form as ``FEATURES``.
    4. Extract the features (Presentation, Reputation, Time, Affect and
       Linguistic), clean them and write them to the ``FEATURES`` path.
    5. Run topic modeling and discretization of continuous variables on the
       written CSV.
    """
    # Logging.
    now = datetime.datetime.now(timezone('US/Pacific')).strftime("%Y-%m-%d")
    logger = logging_config.get_logger(
        _dir=now,
        name="ph_feature_extraction",
        console_level=logging.ERROR,
    )

    # Environment variables and database session.
    os.environ['DB_CONFIG'] = os.path.abspath('db/cfg/dbsetup.yml')
    session = setup_db(os.environ['DB_CONFIG'])

    # NOTE(review): dropping the last 8 characters of the working directory
    # presumably strips the name of the current sub-directory, leaving the
    # project root with its trailing separator -- confirm against the repo
    # layout.
    base_directory = os.getcwd()[:-8]
    # Join with os.path.join instead of appending a literal 'dataset\\': the
    # hard-coded backslash is only a path separator on Windows, elsewhere it
    # becomes part of the file name and the CSV is written to the wrong place.
    csv_directory = os.path.join(base_directory, 'dataset')
    csv_path = os.path.join(csv_directory, 'features_not_discretized.csv')
    os.environ['FEATURES'] = os.path.abspath(csv_path)

    # Extract all features (Presentation, Reputation, Time, Affect and
    # Linguistic), then clean and persist them.
    entries = extract_all_features(session, logger)
    entries = clean_features(entries)
    write_all_features(os.environ['FEATURES'], entries)

    # Post-processing reads and updates the CSV that was just written.
    realize_topic_modeling(csv_path)
    discretize_continuous_variables(csv_path)
return ProductHuntClient(key, secret, uri, token) if __name__ == '__main__': now = datetime.datetime.now(timezone('US/Pacific')).strftime("%Y-%m-%d") now_dt = datetime.datetime.strptime(now, '%Y-%m-%d').date() day = None day_dt = None newest = False update = False pid = None phm = None ph_credentials = 'credentials_miner.yml' help_string = 'Usage:\n\tpython ph_miner.py [-d|--day=<YYYY-MM-DD>] [-p|--postid=N] [-n|--newest] [-u|--update] ' \ '[-c|--credentials=credentials.yml] [--h|--help]' logger = logging_config.get_logger(_dir=now, name="ph_miner", console_level=logging.INFO) exit_code = 0 try: opts, _ = getopt(sys.argv[1:], "hd:p:nuc:", ["help", "day=", "postid=", "newest", "update", "credentials="]) for opt, arg in opts: if opt in ("-h", "--help"): print(help_string) exit(0) elif opt in ("-d", "--day"): day = arg day_dt = datetime.datetime.strptime(day, "%Y-%m-%d").date() elif opt in ("-p", "--postid"): pid = int(arg) elif opt in ("-n", "--newest"): newest = True
import logging import os from pytz import timezone from scrapy.crawler import CrawlerRunner from scrapy.utils.log import configure_logging from scrapy.utils.project import get_project_settings from twisted.internet import defer from twisted.internet import reactor from db.orm.tables import Post from logger import logging_config from review_user_crawler.spiders.producthunt import ReviewSpider, UserSpider logger = logging_config.get_logger(_dir=datetime.datetime.now( timezone('US/Pacific')).strftime("%Y-%m-%d"), name="ph_crawler", console_level=logging.INFO) """ Code to work around the twisted.internet.error.ReactorNotRestartable limitation of the Twisted library See here: https://doc.scrapy.org/en/latest/topics/practices.html """ class CrawlersLauncher: def __init__(self, session): self.session = session self.review_urls = [] self.profile_urls = [] settings = self.__configure_project() self.runner = CrawlerRunner(settings=settings)
if not e[12]: has_website = 0 e[12] = has_website _cleaned_entries.append(e) return _cleaned_entries def write_all_features(outfile, _entries): writer = CsvWriter(outfile) header = ['post_id', 'is_featured', 'score', 'created_at_day', 'created_at_daytime', 'hunter_id', 'hunter_followers', 'hunter_has_twitter', 'hunter_has_website', 'maker_id', 'maker_followers', 'maker_has_twitter', 'maker_has_website'] writer.writerow(header) writer.writerows(_entries) writer.close() if __name__ == '__main__': """ set up logging """ now = datetime.datetime.now(timezone('US/Pacific')).strftime("%Y-%m-%d") logger = logging_config.get_logger(_dir=now, name="ph_feature_extraction", console_level=logging.ERROR) """ set up environment vars """ os.environ['DB_CONFIG'] = os.path.abspath('db/cfg/dbsetup.yml') session = setup_db(os.environ['DB_CONFIG']) os.environ['FEATURES'] = os.path.abspath('temp.csv') entries = extract_all_features(session, logger) entries = clean_all_features(entries) write_all_features(os.environ['FEATURES'], entries)
liwc_errors = False if all_emails: resume_month = already_parsed_uid_project_month(aliases, p.name) liwc_errors = get_score_by_month(uid, p.name, all_emails, resume_month, nlon, nlon_model) del all_emails else: logger.debug( 'No emails from %s <%s> to project \'%s\' mailing lists' % (uid, alias_email_addresses, p.name)) logger.info('Done processing project %s' % p.name) if liwc_errors: return True return False if __name__ == '__main__': logger = logging_config.get_logger('big5_personality', console_level=logging.DEBUG) SessionWrapper.load_config('../db/cfg/setup.yml') session = SessionWrapper.new(init=True) if len(sys.argv) >= 2: tool = sys.argv[1] else: logger.error('Missing mandatory first param for tool: \'liwc15\' or \'p_insights\' expected') sys.exit(-1) if len(sys.argv) > 2 and sys.argv[2] == 'reset': reset_personality_table() try: """ boolean var storing presence of liwc errors """ liwc_errors = main() if tool == 'liwc15':
logger.info('Already parsed user: %s' % username) session.rollback() def already_parsed_users(): return session.query(UsersLocation).count() def reset_users_location_table(): session.query(UsersLocation).delete() session.commit() logger.info('Done resetting table') if __name__ == '__main__': logger = logging_config.get_logger('users_location', console_level=logging.DEBUG) SessionWrapper.load_config('../db/cfg/setup.yml') session = SessionWrapper.new(init=True) if len(sys.argv) > 1 and sys.argv[1] == 'reset': reset_users_location_table() try: already_parsed_users = already_parsed_users() token = open("github-api-tokens.txt", "r").readline() g = Github(token) count_users = 0 for user in get_github_users(): if count_users == already_parsed_users:
UsersLocation.username).filter( UsersLocation.location.isnot(None)).all() id = -offset for g in githubbers: id -= 1 continent = geo.extract_continent(unidecode(g.location)) if continent: row = UsersRegionId(id=id, continent=continent, username=g.username, email=g.email, name=g.name) session.add(row) session.commit() EMAILERS_OFFSET = 900000 GITHUBBERS_OFFSET = 1000000 if __name__ == '__main__': logger = logging_config.get_logger('unmask_aliases') SessionWrapper.load_config('../db/cfg/setup.yml') session = SessionWrapper.new(init=True) setup_emailers_id(EMAILERS_OFFSET) setup_githubbers_id_location(GITHUBBERS_OFFSET) aliases, everyone = unmask(sys.argv[1:]) logger.info('Done, looking for unmatched users') unmatched = find_missing_aliases(aliases, everyone) logger.info('Done: unmatched %s' % len(unmatched))
def get_already_parsed_projects(): seen = set() SessionWrapper.load_config('../db/cfg/setup.yml') s = SessionWrapper.new(init=True) res = s.query(PullRequest.slug).distinct() for r in res: seen.add(r.slug) return seen if __name__ == '__main__': pr_file = 'tmp_pullrequests.csv' # comment_file = 'tmp_comments.csv' logger = logging_config.get_logger('pr_extractor') try: tokens = Tokens() tokens_iter = tokens.iterator() manager = Manager() tokens_queue = manager.Queue() for token in tokens_iter: tokens_queue.put(token) tokens_map = manager.dict() extractor = PrAndCommentExtractor(tokens, tokens_queue, tokens_map) print("Retrieving the list of cloned GitHub project") slugs = get_github_slugs(sys.argv[1]) print("%s" % len(slugs)) print("Retrieving the list of project already analyzed") extractor.seen = get_already_parsed_projects()
reward=result["reward"].replace(",", ".")) session.add(ls) session.commit() logger.info('Imported results from file: \'%s\'' % result['Filename']) def reset_table(): if dictionary == '2007': session.query(Liwc2007Scores).delete() elif dictionary == '2015': session.query(Liwc2015Scores).delete() if __name__ == '__main__': logger = logging_config.get_logger('save_liwc_scores', console_level=logging.DEBUG) SessionWrapper.load_config('../db/cfg/setup.yml') session = SessionWrapper.new(init=True) if len(sys.argv) >= 3: dictionary = sys.argv[2] else: logger.error('Missing mandatory params') sys.exit(-1) reset_table() try: with open(sys.argv[1]) as csvfile: reader = csv.DictReader(csvfile) for row in reader:
dictionary. :return: dictionary of unmasked dev ids (key) and continent (value) """ res = session.query(UsersRegionId.id, UsersRegionId.continent).all() uids_cont = dict() for _r in res: try: unmasked_id = alias_map[_r.id] uids_cont.update({unmasked_id: _r.continent}) except KeyError: pass return uids_cont if __name__ == '__main__': logger = logging_config.get_logger('export', console_level=logging.DEBUG) SessionWrapper.load_config('../db/cfg/setup.yml') session = SessionWrapper.new(init=True) alias_map = load_alias_map('../unmasking/idm/dict/alias_map.dict') uids_continent = load_continent_info() if len(sys.argv) >= 2: tool = sys.argv[1] elif len(sys.argv) < 2: logger.error( 'Missing mandatory first param for tool: \'liwc07\', \'liwc15\', or \'p_insights\' expected' ) sys.exit(-1) save_personality_results() save_commit_results()
with open(filename, "rb") as f: unpickler = pickle.Unpickler(f) aliases = unpickler.load() return aliases def get_alias_ids(_map, uid): aliases = set() for key in _map.keys(): if _map[key] == uid and key != uid: aliases.add(key) return list(aliases) def reset_commit_history_table(): session.query(CommitHistoryDevProject).delete() session.commit() if __name__ == '__main__': logger = logging_config.get_logger('commit_history') SessionWrapper.load_config('../db/cfg/setup.yml') session = SessionWrapper.new(init=True) reset_commit_history_table() alias_map = load_alias_map('../unmasking/idm/dict/alias_map.dict') try: main() except KeyboardInterrupt: logger.error('Received Ctrl-C or other break signal. Exiting.')
format(name, email, db_repo.slug)) db_repo.min_commit = min_commit db_repo.max_commit = max_commit db_repo.total_commits = total_commits session.add(db_repo) session.commit() logger.info('Done') return slug except Exception as e: logger.error(msg="{0}: unknown error:\t{1}".format(slug, e)) traceback.print_exc() finally: return slug if __name__ == '__main__': logging.basicConfig() logger = logging_config.get_logger('commit_analyzer', logging.DEBUG) # create a new session and init db tables SessionWrapper.load_config('../db/cfg/setup.yml') SessionWrapper.new(init=True) repos = [d for d in os.listdir(os.path.abspath(sys.argv[1]))] for r in repos: parse_commits(r, repos_folder=sys.argv[1])
import nltk from logger import logging_config from unmasking.geolite2 import cities_contries_continents, cities, countries, continents, SelectedFields logger = logging_config.get_logger('geo') def extract_continent(location): nes = find_places(location) city = None likely_city = False country = None likely_country = False continent = None likely_continent = False for n in nes: n = n.lower() if n in cities and not likely_city: city = n likely_city = True if n in countries and not likely_country: country = n likely_country = True if n in continents and not likely_continent: continent = n likely_continent = True if not continent: if country: