def vectorize(start, stop):
    """
    Master function: builds tag vectors for the extracted files in the
    slice [start:stop], writing the results into tag-vectors/

    :return: None
    """
    sets = eld("sets")
    extracted = eld("extracted2")[start:stop]
    td = get_tuple_dict(sets)
    outdir = "tag-vectors"
    if not exists(outdir):
        mkdir(outdir)
    for fil in extracted:
        try:
            group_by_tags(fil, td, outdir)
        except (TypeError, ValueError, OverflowError):
            # skip files that cannot be parsed or grouped
            continue
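# NOTE: `eld` is used by every function in this section but is not defined
# here. From the call sites it appears to return the full paths of the files
# in a directory. The sketch below is an assumption based on that usage, not
# the project's actual helper, and presumes the module-level imports
# (os, os.path, gzip, json, re) that the surrounding functions rely on.
def eld(directory):
    """Assumed behavior: return the full path of every file in `directory`."""
    return [os.path.join(directory, fil) for fil in os.listdir(directory)]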
def get_hashtags():
    """
    Collect the hashtag sets from every master vector file that carries
    more than two hashtags.

    :return: list of sets of hashtags
    """
    master_vectors = eld('master_vectors')
    tag_lists = []
    for vector_list in master_vectors:
        with gzip.open(vector_list) as f:
            js = json.load(f)
        hashtags = js.get('hashtags') or []  # guard against entries with no hashtag field
        if len(hashtags) > 2:
            tag_lists.append(set(hashtags))
    return tag_lists
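# Hedged usage sketch: get_hashtags() returns one set of tags per qualifying
# master-vector file. The helper below is not part of the original code; it
# just reports the most frequent tags across those sets.
def summarize_hashtags(top_n=10):
    from collections import Counter
    counts = Counter(tag for tags in get_hashtags() for tag in tags)
    for tag, count in counts.most_common(top_n):
        print "{}\t{}".format(tag, count)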
def fix_all(directory):
    """Run fix_js on every JSON file in `directory` that does not already
    have a counterpart in OUT_DIR."""
    exclude = [os.path.join(directory, fil) for fil in os.listdir(OUT_DIR)]

    def fil_is_valid(fil):
        return fil not in exclude and bool(
            re.match(r'[\w]+\.json', os.path.split(fil)[-1]))

    fils = filter(fil_is_valid, eld(directory))
    num_fils = len(fils)
    for idx, fil in enumerate(fils, 1):
        print "fixing {} ({} of {})".format(fil, idx, num_fils)
        fix_js(fil)
def master_nonvectorize():
    """
    Equivalent of master_vectorize for get_master_nonvectors: calls
    get_master_nonvectors for all files in tag_values

    :return: None (writes to master_values)
    """
    outdir = "master_values"
    if not exists(outdir):
        mkdir(outdir)
    clustered_values = eld("tag_values")
    for fil in clustered_values:
        get_master_nonvectors(fil, join(outdir, split(fil)[1]))
def extract_folder(val, folder, outfile, user=False):
    """
    Calls extractor on every file in a folder

    :param val: the value to extract
    :type val: string
    :param folder: the path to the folder
    :type folder: string (path to folder)
    :param outfile: the file to write to
    :type outfile: string (file name)
    :param user: passed through to extractor
    :type user: bool
    :return: None (writes to outfile)
    """
    for f in eld(folder):
        extractor(val, f, outfile, user)
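# Hypothetical usage: the field name, folder, and output file below are
# placeholders for illustration, not paths taken from the project.
def extract_text_example():
    """Hedged example: pull the 'text' field out of every file in a folder."""
    extract_folder("text", "raw_tweets", "extracted_text.json", user=False)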
def process_all(outfile='arff_data.txt', dry_run=False):
    """Assemble the entry for every master vector file and either print the
    result (dry_run) or write it to `outfile`."""
    fils = eld('master_vectors')
    s = ''
    for fil in fils:
        entry = process_entry(fil)
        if not entry:
            print "WARNING: No corresponding value file found for {fil}".format(fil=fil)
        else:
            s += entry
    if dry_run:
        print s
    else:
        with open(outfile, 'w') as f:
            f.write(s)
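# Hedged usage: preview the assembled output before writing it to the default
# arff_data.txt; the wrapper name and the two-step flow are illustrative only.
def build_dataset():
    process_all(dry_run=True)  # print the assembled entries without writing
    process_all()              # then write them to arff_data.txt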
def non_vectorize(start, stop):
    """
    vectorize equivalent for non-vector values

    :return: None
    """
    extracted = eld("extracted2")[start:stop]
    outdir = "tag_values"
    if not exists(outdir):
        mkdir(outdir)
    for fil in extracted:
        try:
            group_non_vectors(fil, outdir)
        except (TypeError, ValueError, OverflowError):
            # skip files that cannot be parsed or grouped
            continue
def __init__(self, fnames=None, folder=None):
    """
    :param fnames: list of JSON files to process
    :type fnames: list of strings (file names)
    :param folder: path to a folder containing JSON files to process
    :type folder: string
    """
    self.fnames = []
    if fnames:
        self.fnames += fnames
    if folder:
        self.fnames += eld(folder)
    self.domain_pat = re.compile(r"https?://([\w\d\.\-]+\.\w{2,3})")
    self.clean_pat = re.compile(r"(@|#|http)\S+")
    self.parsed = []
    self.tweets = []
def process_dir(directory, use_testdb=False, show_sql=True):
    """
    Process a whole directory.

    :param directory: the directory to process
    :type directory: str (path to a directory)
    :param use_testdb: whether to make the Inserter instance use the test
        db/cache settings
    :type use_testdb: bool
    :param show_sql: passed through to the Inserter (controls SQL output)
    :type show_sql: bool
    :return: None
    """
    inserter = Inserter(use_testdb, show_sql)
    try:
        fils = eld(directory)
        for fil in fils:
            process_fil(fil, inserter)
            print "Processed {fil}".format(fil=fil)
    finally:
        inserter.DB.commit()
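# Hedged usage: run a directory through the test database settings before the
# real one; the directory name below is a placeholder, not a project path.
def load_directory_example(directory="json_dumps"):
    process_dir(directory, use_testdb=True, show_sql=False)   # trial run
    process_dir(directory, use_testdb=False, show_sql=False)  # real insert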