Пример #1
0
def vectorize(start, stop):
    """
    Master function: tag-groups every extracted file in the
    [start:stop] slice, writing output under "tag-vectors".
    :return: None
    """
    tag_sets = eld("sets")
    source_fils = eld("extracted2")[start:stop]
    tuple_dict = get_tuple_dict(tag_sets)
    target_dir = "tag-vectors"
    if not exists(target_dir):
        mkdir(target_dir)
    for source in source_fils:
        try:
            group_by_tags(source, tuple_dict, target_dir)
        except (TypeError, ValueError, OverflowError):
            # Skip files that cannot be grouped; keep processing the rest.
            continue
Пример #2
0
def vectorize(start, stop):
    """
    Master function
    Groups each extracted file in extracted2[start:stop] by tags and
    writes the results into the "tag-vectors" directory.
    :return: None
    """
    sets_data = eld("sets")
    fils = eld("extracted2")[start:stop]
    tuples = get_tuple_dict(sets_data)
    out = "tag-vectors"
    if not exists(out):
        mkdir(out)
    for fil in fils:
        try:
            group_by_tags(fil, tuples, out)
        except (TypeError, ValueError, OverflowError):
            # A bad file is skipped rather than aborting the whole run.
            continue
Пример #3
0
def get_hashtags():
    """
    Collect hashtag sets from every gzipped master-vector JSON file.

    Only entries with more than two hashtags are kept; each is stored
    as a set (useful for intersection/membership tests downstream).
    :return: list of sets of hashtags
    """
    tag_lists = []
    for vector_list in eld('master_vectors'):
        with gzip.open(vector_list) as f:
            js = json.load(f)
            # js.get() returns None when the 'hashtags' key is absent;
            # fall back to [] so len() cannot raise TypeError.
            hashtags = js.get('hashtags') or []
            if len(hashtags) > 2:
                tag_lists.append(set(hashtags))
    return tag_lists
Пример #4
0
def fix_all(directory):
    """
    Run fix_js over every JSON file in *directory*, skipping files
    already present in OUT_DIR.
    :param directory: directory whose files should be fixed
    :type directory: str (path to a directory)
    :return: None (fix_js writes its own output)
    """
    # Set membership is O(1); the original scanned a list per file.
    exclude = set(os.path.join(directory, fil) for fil in os.listdir(OUT_DIR))

    def fil_is_valid(fil):
        return fil not in exclude and bool(re.match(r'[\w]+\.json', os.path.split(fil)[-1]))

    # Materialize explicitly so len() works whether eld yields a list
    # or an iterator (and under both Python 2 and 3).
    fils = [fil for fil in eld(directory) if fil_is_valid(fil)]
    num_fils = len(fils)
    # Count from 1 so the final line reads "N of N", not "N-1 of N".
    for idx, fil in enumerate(fils, 1):
        print("fixing {} ({} of {})".format(fil, idx, num_fils))
        fix_js(fil)
Пример #5
0
def master_nonvectorize():
    """
    Equivalent of master_vectorize for get_master_nonvectors: calls
    get_master_nonvectors for every file in tag_values.
    :return: None (writes to master_values)
    """
    target_dir = "master_values"
    if not exists(target_dir):
        mkdir(target_dir)
    for fil in eld("tag_values"):
        # Keep the original base name, relocated into the output dir.
        get_master_nonvectors(fil, join(target_dir, split(fil)[1]))
Пример #6
0
def master_nonvectorize():
    """
    Equivalent of master_vectorize for get_master_nonvectors: runs
    get_master_nonvectors on each file found in tag_values.
    :return: None (writes to master_values)
    """
    out = "master_values"
    if not exists(out):
        mkdir(out)
    clustered = eld("tag_values")
    for source in clustered:
        destination = join(out, split(source)[1])
        get_master_nonvectors(source, destination)
Пример #7
0
def extract_folder(val, folder, outfile, user=False):
    """
    Runs extractor on every file found in a folder.
    :param val: the value to extract
    :type val: string
    :param folder: the path to the folder
    :type folder: string (path to folder)
    :param outfile: the file to write to
    :type outfile: string (file name)
    :param user: forwarded to extractor unchanged
    :return: None (writes to outfile)
    """
    fils = eld(folder)
    for fil in fils:
        extractor(val, fil, outfile, user)
Пример #8
0
def fix_all(directory):
    """
    Run fix_js over every JSON file in *directory*, skipping files
    already present in OUT_DIR.
    :param directory: directory whose files should be fixed
    :type directory: str (path to a directory)
    :return: None (fix_js writes its own output)
    """
    # Set membership is O(1); the original scanned a list per file.
    exclude = set(os.path.join(directory, fil) for fil in os.listdir(OUT_DIR))

    def fil_is_valid(fil):
        return fil not in exclude and bool(
            re.match(r'[\w]+\.json',
                     os.path.split(fil)[-1]))

    # Materialize explicitly so len() works whether eld yields a list
    # or an iterator (and under both Python 2 and 3).
    fils = [fil for fil in eld(directory) if fil_is_valid(fil)]
    num_fils = len(fils)
    # Count from 1 so the final line reads "N of N", not "N-1 of N".
    for idx, fil in enumerate(fils, 1):
        print("fixing {} ({} of {})".format(fil, idx, num_fils))
        fix_js(fil)
Пример #9
0
def process_all(outfile='arff_data.txt', dry_run=False):
    """
    Concatenate processed entries for every master-vector file.
    :param outfile: path the combined data is written to
    :param dry_run: when True, print the data instead of writing it
    :return: None
    """
    parts = []
    for fil in eld('master_vectors'):
        # Call process_entry once per file; the original called it twice,
        # doubling the work (and any side effects) for every valid entry.
        entry = process_entry(fil)
        if not entry:
            print("""WARNING: No corresponding value file found for {fil}""".format(fil=fil))
        else:
            parts.append(entry)
    # join() is linear; repeated += on a str is quadratic.
    s = ''.join(parts)
    if dry_run:
        print(s)
    else:
        with open(outfile, 'w') as f:
            f.write(s)
Пример #10
0
def non_vectorize(start, stop):
    """
    vectorize equivalent for non-vector values
    :return: None
    """
    source_fils = eld("extracted2")[start:stop]
    target_dir = "tag_values"
    if not exists(target_dir):
        mkdir(target_dir)
    for source in source_fils:
        try:
            group_non_vectors(source, target_dir)
        except (TypeError, ValueError, OverflowError):
            # Skip files that cannot be grouped; keep processing the rest.
            continue
Пример #11
0
def non_vectorize(start, stop):
    """
    vectorize equivalent for non-vector values: groups each file in
    extracted2[start:stop] into the "tag_values" directory.
    :return: None
    """
    fils = eld("extracted2")[start:stop]
    out = "tag_values"
    if not exists(out):
        mkdir(out)
    for fil in fils:
        try:
            group_non_vectors(fil, out)
        except (TypeError, ValueError, OverflowError):
            # A bad file is skipped rather than aborting the whole run.
            continue
Пример #12
0
 def __init__(self, fnames=None, folder=None):
     """
     Collects the JSON file names to process (from an explicit list
     and/or a folder) and compiles the regex patterns used later.
     :param fnames: list of JSON files to process
     :type fnames: list of strings (file names)
     :param folder: path to a folder containing JSON files to process
     :type folder: string
     """
     names = []
     if fnames:
         names.extend(fnames)
     if folder:
         names.extend(eld(folder))
     self.fnames = names
     # Captures the host part of an http(s) URL.
     self.domain_pat = re.compile(r"https?://([\w\d\.\-]+\.\w{2,3})")
     # Matches mentions, hashtags, and URLs for stripping.
     self.clean_pat = re.compile(r"(@|#|http)\S+")
     self.parsed = []
     self.tweets = []
Пример #13
0
 def __init__(self, fnames=None, folder=None):
     """
     Builds the worklist of JSON files (explicit names plus folder
     contents) and prepares the regex patterns used during parsing.
     :param fnames: list of JSON files to process
     :type fnames: list of strings (file names)
     :param folder: path to a folder containing JSON files to process
     :type folder: string
     """
     self.fnames = list(fnames) if fnames else []
     if folder:
         self.fnames.extend(eld(folder))
     # Captures the host part of an http(s) URL.
     self.domain_pat = re.compile(r'https?://([\w\d\.\-]+\.\w{2,3})')
     # Matches mentions, hashtags, and URLs for stripping.
     self.clean_pat = re.compile(r'(@|#|http)\S+')
     self.parsed = []
     self.tweets = []
Пример #14
0
def process_dir(directory, use_testdb=False, show_sql=True):
    """
    Process a whole directory.
    :param directory: the directory to process
    :type directory: str (path to a directory)
    :param use_testdb: whether the Inserter should use the test db/cache settings
    :type use_testdb: bool
    :param show_sql: whether the Inserter should echo SQL
    :type show_sql: bool
    :return: None
    """
    inserter = Inserter(use_testdb, show_sql)
    try:
        for fil in eld(directory):
            process_fil(fil, inserter)
            print("Processed {fil}".format(fil=fil))
    finally:
        # Commit whatever was inserted even if a file fails mid-run.
        inserter.DB.commit()
Пример #15
0
def process_dir(directory, use_testdb=False, show_sql=True):
    """
    Processes every file in a directory through an Inserter.
    :param directory: the directory to process
    :type directory: str (path to a directory)
    :param use_testdb: whether the Inserter should use the test db/cache settings
    :type use_testdb: bool
    :param show_sql: whether the Inserter should echo SQL
    :type show_sql: bool
    :return: None
    """
    inserter = Inserter(use_testdb, show_sql)
    try:
        fils = eld(directory)
        for fil in fils:
            process_fil(fil, inserter)
            print("Processed {fil}".format(fil=fil))
    finally:
        # The commit runs even when a file raises part-way through.
        inserter.DB.commit()