Example #1
def read_data():
    _vocab = load_list(FLAGS.data_dir, '.vocab')
    _labels = load_list(FLAGS.data_dir, '.labels')

    if FLAGS.format == 'pkl':
        with open(FLAGS.input, 'rb') as f:
            data = pickle.load(f)
    elif FLAGS.format == 'iob':
        with open(FLAGS.input) as f:
            length = 0
            _length = 0
            for line in f:
                if line == '\n' and length > 0:
                    _length = max(_length, length)
                    length = 0
                else:
                    length += 1
        if _length > FLAGS.num_steps:
            raise ValueError('Max sequence length %i > num_steps %i' %
                             (_length, FLAGS.num_steps))
        else:
            _length = FLAGS.num_steps

        with open(FLAGS.input) as f:
            data = {'tokens': [], 'labels': [], 'lengths': [], 'weights': []}
            tokens = []
            labels = []
            for line in f:
                if line == '\n' and tokens and labels:
                    length = len(tokens)
                    weights = [1] * len(tokens) + [0] * (_length - length)

                    tokens += [0] * (_length - length)
                    labels += [0] * (_length - length)

                    data['tokens'].append(tokens)
                    data['labels'].append(labels)
                    data['lengths'].append(length)
                    data['weights'].append(weights)

                    tokens = []
                    labels = []
                else:
                    token, label = line.split()
                    if token in _vocab:
                        tokens.append(_vocab.index(token))
                    else:
                        tokens.append(1)

                    labels.append(_labels.index(label))
    elif FLAGS.format == 'txt':
        raise ValueError('Format not supported yet.')
    else:
        raise ValueError('Unknown file format %s.' % FLAGS.format)
    return _vocab, _labels, data
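
For reference, a small sketch of the input the 'iob' branch expects and the padded entry it produces; the vocabulary indices, label order, and num_steps value below are illustrative assumptions, not values from the original project:

# Hypothetical IOB input: one "token label" pair per line, with a blank
# line ending each sentence (tokens, labels, and indices are made up):
#
#   John B-PER
#   lives O
#   here O
#
# Assuming num_steps = 5, a vocabulary where "John" is out-of-vocabulary
# (mapped to index 1) and "lives"/"here" sit at indices 7 and 3, and
# _labels = ['O', 'B-PER'], the entry appended for this sentence is:
tokens  = [1, 7, 3, 0, 0]   # word indices, zero-padded to num_steps
labels  = [1, 0, 0, 0, 0]   # label indices, zero-padded
length  = 3                 # true sequence length before padding
weights = [1, 1, 1, 0, 0]   # 1 for real tokens, 0 for padding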
Example #2
    def read_part_of_posting(self,
                             posting,
                             num_of_file,
                             last_file=False,
                             first_read=False):
        """gets a posting NAME and it's index!! and reads it's content from the disk
           store the file descriptor of current posting file"""
        num_of_file += 1  # this gives values of 1..* to file names, skipping 0

        with open(posting, 'rb') as pickle_in:
            # pickle_in = open("{}".format(posting), "rb")
            if num_of_file in self.file_descriptor_dict:
                fdr = self.file_descriptor_dict[num_of_file]
                pickle_in.seek(fdr)
            part_of_posting = []

            if int(Indexer.NUM_OF_TERMS_IN_POSTINGS /
                   Indexer.PICKLE_COUNTER) > 0:
                amount_to_read = int(Indexer.NUM_OF_TERMS_IN_POSTINGS /
                                     Indexer.PICKLE_COUNTER)
            else:
                amount_to_read = Indexer.NUM_OF_TERMS_IN_POSTINGS
            if last_file:
                amount_to_read = Indexer.NUM_OF_TERMS_IN_POSTINGS

            # amount_to_read = 2325
            if first_read:
                for i in range(amount_to_read):
                    try:
                        key_value = utils.load_list(pickle_in)
                        # key_value = pickle.load(pickle_in)
                        part_of_posting.append(key_value)
                    except Exception:
                        break
            else:
                for i in range(amount_to_read):
                    try:
                        key_value = utils.load_list(pickle_in)
                        # key_value = pickle.load(pickle_in)
                        part_of_posting.append(key_value)
                        self.values_size += len(key_value[1])

                        if self.values_size >= 2000000:
                            self.values_size = 0
                            break
                    except Exception:
                        break

            self.file_descriptor_dict[num_of_file] = pickle_in.tell()
        # pickle_in.close()

        return part_of_posting
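
The pattern worth noting here is that each posting file's read position (pickle_in.tell()) is remembered in file_descriptor_dict, so the next call can seek() back and continue where the previous call stopped. A minimal, self-contained sketch of that idea with plain pickle, since the exact behavior of utils.load_list is not shown here; the function name, argument names, and EOFError assumption are illustrative:

import pickle

def read_next_batch(path, offsets, batch_size):
    # `offsets` maps a file path to the byte position reached by the previous
    # call; a missing entry means "start from the beginning of the file".
    records = []
    with open(path, 'rb') as f:
        f.seek(offsets.get(path, 0))
        for _ in range(batch_size):
            try:
                records.append(pickle.load(f))
            except EOFError:
                break
        offsets[path] = f.tell()  # remember where to resume next time
    return records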
Example #3
def read_clust_output(fname, max_pval):
    lines = ut.load_list(fname)[1:]
    cxs = [set(line.split(',')[7].strip('"').split()) for line in lines]
    pvals = [line.split(',')[6] for line in lines]
    details = [','.join(line.split(',')[:7]) for line in lines]
    print "Retaining complexes with p < %0.2f." % max_pval
    cxs,pvals,details = keep_pvals(cxs,pvals,details, max_pval)
    return cxs,pvals,details
Example #4
def load_prots_from_fasta(fname):
    """
    Files are in ut.config()['fastadir'].
    Returns a set since usually I'm searching against it.
    """
    protlines = [l for l in ut.load_list(fname) if l[0]=='>']
    genes = set([l.split(' ')[0].strip('>') for l in protlines])
    return genes
Example #5
def load_prots_from_fasta_dep(fname):
    """
    Files are in ut.config()['fastadir'].
    All so far can be split by both space and |.
    Returns a set since usually I'm searching against it.
    """
    protlines = [l[1:] for l in ut.load_list(fname) if l[0]=='>']
    prots = set([l.split(' ')[0].split('|')[0] for l in protlines])
    return prots
Example #6
def _load_prots_to_lol(fname):
    prots = ut.load_list(fname)
    prots_clean = []
    for line in prots:
        if line[0] == '>': 
            prots_clean.append([line])
        else:
            prots_clean[-1].append(line)
    return prots_clean
Example #7
    def load_index(self, fn):
        """
        Loads a pre-computed index (or indices) so we can answer queries.
        Input:
            fn - file name of pickled index.
        """
        listObj = utils.load_list(fn)
        self.inverted_idx = listObj[0]
        self.postingDict = listObj[1]
        self.documents = listObj[2]
Example #8
def multi_identities(input_fname, out_dir):
    input_list = ut.load_lol(input_fname)
    for desc, prots_fname, source_fasta, odict, target in input_list:
        print "%s, proteins: %s\n source: %s\n odict: %s\ntarget: %s" % (desc,
                prots_fname, source_fasta, odict, target)
        prots = ut.load_list(prots_fname)
        sims = all_identities(prots, odict, source_fasta, target)
        out_fname = os.path.join(out_dir,
                ut.shortname(target).split('.')[0] + "_" + desc + ".txt")
        ut.write_tab_file(sims, out_fname, islist=True)
Example #9
def figures(recon_fname, exclude_recon_fname, kegg_fname, all_ppis_fname, 
        recon_pairs=None, do_plot_cdf=False, return_pairs=False):
    rpairs = recon_pairs or load_seq_pairs(recon_fname,
            ut.load_list(exclude_recon_fname))
    kpairs = load_kegg_sequentials(kegg_fname)
    pdk = pd.PairDict(kpairs)
    intpairs = [p for p in rpairs if pdk.contains(p)]
    ppis = pu.load_ppis(all_ppis_fname) 
    plot_pairs_randoms_etc(intpairs, ppis)
    if do_plot_cdf:
        plot_cdf_pos_randoms(intpairs, ppis)
    if return_pairs:
        return intpairs
Example #10
def prots2genes(fname):
    """
    If there's only one item in the first line, just return a dummy dict
    mapping each id to itself.
    Otherwise, assume the line begins with >GENEID and ends with
    protein:PROTEINID.
    """
    lines = [l for l in ut.load_list(fname) if len(l)>0 and l[0]=='>']
    if len(lines[0].split())==1:
        return dict([(g,g) for g in [l.strip('>') for l in lines]])
    elif len(lines[0].split(':'))==1:
        # Xl
        return dict([(g,g) for g in [l.split()[0].strip('>') for l in lines]])
    else:
        return dict([(p.split()[-1].split(':')[1], p.split()[0].strip('>'))
                    for p in lines])
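
To make the three branches concrete, here are illustrative header shapes (the gene and protein ids are invented) together with the mapping each uniform file would produce; the file name in the usage line is hypothetical:

# ">GENE1"                            -> {"GENE1": "GENE1"}  (single item per header)
# ">GENE1 some description"           -> {"GENE1": "GENE1"}  (no colon in the first header)
# ">GENE1 description protein:PROT1"  -> {"PROT1": "GENE1"}  (protein id keyed to gene id)
p2g = prots2genes("proteome.fasta")   # hypothetical file name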
Example #11
    def merge_chunks(self):
        """
        performs a K-way merge on the posting files -> N disk accesses
        writes new posting files to the disk.
        :return:
        """
        saved_chunks = []
        chunks_indices = np.zeros(shape=(len(self.locations_at_postings)),
                                  dtype=np.int32)
        chunk_length = self.postingDict_size // len(
            self.locations_at_postings) + 1
        # load the first chunk of each posting file into saved_chunks
        for key in self.locations_at_postings:
            loaded, offset = utils.load_list(key, self.config.get_out_path(),
                                             self.locations_at_postings[key],
                                             chunk_length)
            saved_chunks.append(loaded)
            self.locations_at_postings[key] = offset

        building_list = []
        all_empty = True

        # loop until every posting file has been fully consumed.
        while all_empty:
            should_enter = -1

            # loops through as long as one of the chunks is not done.
            while should_enter == -1:
                term_to_enter = self.find_term(saved_chunks, chunks_indices)
                tuples_to_merge = []
                indexes_of_the_indexes_to_increase = []

                # find all tuples that should be merged and note which chunk indices to advance
                for idx, term_idx_in_chunk in enumerate(chunks_indices):
                    if term_idx_in_chunk < len(saved_chunks[idx]) and \
                            saved_chunks[idx][term_idx_in_chunk][0] == term_to_enter:
                        tuples_to_merge.append(
                            saved_chunks[idx][term_idx_in_chunk])
                        indexes_of_the_indexes_to_increase.append(idx)

                merged_tuple = self.merge_terms_into_one(tuples_to_merge)
                appended_term = merged_tuple[0]
                should_append = True
                # if it is a named entity and it exists in less than 2 tweets, erase this term.
                if appended_term in self.entities_dict and self.entities_dict[
                        appended_term] < 2:
                    should_append = False
                    self.inverted_idx.pop(appended_term, None)
                # update terms with capital letters
                if appended_term in self.global_capitals and self.global_capitals[
                        appended_term]:
                    merged_tuple = (appended_term.upper(), merged_tuple[1])
                    inverted_val = self.inverted_idx[appended_term]
                    self.inverted_idx.pop(appended_term, None)
                    self.inverted_idx[appended_term.upper()] = inverted_val
                appended_term = merged_tuple[0]
                if appended_term in self.inverted_idx and self.inverted_idx[
                        appended_term][0] == 1:
                    should_append = False
                    self.inverted_idx.pop(appended_term, None)
                if should_append:
                    self.accumulative_size += len(merged_tuple[1])
                    building_list.append(merged_tuple)
                    self.inverted_idx[merged_tuple[0]][1] = str(
                        self.counter_of_postings)

                # advance the index of every chunk whose tuple was merged into the new posting
                for idx in indexes_of_the_indexes_to_increase:
                    chunks_indices[idx] += 1

                should_enter = self.update_should_enter(
                    saved_chunks, chunks_indices)

                # save as soon as the accumulated size reaches the maximum size of a final posting file
                if self.accumulative_size >= self.max_accumulative:
                    self.merged_dicts.append(str(self.counter_of_postings))
                    utils.save_list(building_list,
                                    str(self.counter_of_postings),
                                    self.config.get_out_path())
                    self.accumulative_size = 0
                    self.counter_of_postings += 1
                    building_list = []
            # load new chunks into saved_chunks at the relevant indices.
            for index in should_enter:
                loaded, offset = utils.load_list(
                    str(index), self.config.get_out_path(),
                    self.locations_at_postings[str(index)], chunk_length)
                saved_chunks[index] = loaded
                chunks_indices[index] = 0
                self.locations_at_postings[str(index)] = offset

            # checks whether all postings are done.
            all_empty = False
            for chunk in saved_chunks:
                if len(chunk) > 0:
                    all_empty = True
                    break

        # save the last posting file.
        if len(building_list) > 0:
            self.merged_dicts.append(str(self.counter_of_postings))
            utils.save_list(building_list, str(self.counter_of_postings),
                            self.config.get_out_path())
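
The docstring above names a K-way merge over the per-chunk posting lists; the method interleaves that merge with offset bookkeeping, named-entity and capitalization handling, and size-based flushing. As a reference point only (not the author's implementation), a stripped-down sketch of the core merge step over already-sorted (term, postings) chunks using heapq:

import heapq
from collections import defaultdict

def k_way_merge(sorted_chunks):
    # Each chunk is a list of (term, postings) tuples sorted by term.
    # heapq.merge yields them globally sorted; equal terms are concatenated.
    postings_by_term = defaultdict(list)
    for term, postings in heapq.merge(*sorted_chunks, key=lambda pair: pair[0]):
        postings_by_term[term].extend(postings)
    # dicts preserve insertion order, and terms arrive already sorted
    return list(postings_by_term.items())

# k_way_merge([[('a', [1]), ('c', [3])], [('a', [2]), ('b', [5])]])
# -> [('a', [1, 2]), ('b', [5]), ('c', [3])]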
Example #12
import os, re
import numpy as np
from utils import load_list, write_list, printProgressBar

sites = ['gamepedia', 'LeagueFandom', 'mobafire']

# get the list of all full text files for all sites
files = list()
for site in sites:
    folder = os.path.join("data", site + "Data")
    filelist = os.listdir(folder)
    files = files + [
        os.path.join(folder, b) for b in filelist if "fullText" in b
    ]

# get the words to count
terms = load_list("data/filteredTerms.txt")
urls = list()
counts = np.zeros([len(files), len(terms)])
countFilename = "data/counts.txt"

removeText = load_list('removeText.txt')

# Write the header line of the counts file: "site" followed by the tab-separated terms
with open(countFilename, 'w+', encoding='utf-8', errors='ignore') as f:
    f.write("site\t")
    for t in terms:
        f.write(t + "\t")
    f.write("\n")

# main loop
for idx, file in enumerate(files):
Example #13
        edge.target]["community"]
    if is_delete:
        r = random.random()
        w = edge["weight"]
        if f(w) < r:
            is_delete = False

    return is_delete


if __name__ == "__main__":
    start_time = time.time()

    path = "./Dataset/category_input"
    selected_categories = utils.load_list(path)
    print("Selected {} categories : {}".format(len(selected_categories),
                                               selected_categories))

    tags = []
    map_pair_tag_occ = {}
    for i, cat in enumerate(selected_categories):
        print("{}/{} ...".format(i + 1, len(selected_categories)))
        path = "./Dataset/refined_data/Pair_Tag/{}.csv".format(cat)
        df = utils.load_csv(path)
        for _, row in df.iterrows():
            tag1, tag2, occ = "_{}".format(row["Tag1"]), "_{}".format(
                row["Tag2"]), int(row["Num_Occurrence"])
            tags.extend([tag1, tag2])
            old_occ = map_pair_tag_occ.get((tag1, tag2), 0)
            map_pair_tag_occ.update({(tag1, tag2): old_occ + occ})
Example #14
import utils
import re
files = ["data/mobafire_CompletedPagesList.txt"]
pages = utils.load_list(files[0])
terms = set()
for p in pages:
    words = p.split('/')
    for w in words:
        if '-' in w:
            # replace '-' with ' '
            w = w.split('-')
            # remove the last part if it is a number or "guide"
            if w[-1] == 'guide' or w[-1].isnumeric():
                w = w[:-1]
            w = " ".join(w)
        terms.add(w.lower().strip())
mobafireTermsList = list(terms)
mobafireTermsList.sort()

file = "data/LeagueFandom_CompletedPagesList.txt"
pages = utils.load_list(file)
terms = set()
termsToReplace = [('_', ' '), ('%27', '\''), ('(item)', ''), ('%26', '&')]
for p in pages:
    if "Teamfight_Tactics" in p:
        continue
    words = p.split('/')
    for w in words:

        for t in termsToReplace:
            w = w.replace(t[0], t[1])
Example #15
# making it easier to load and view the words of most interest.
# The number of words in each file can be set via command line (-b or --batch argument), default to 500 words

import numpy as np
import pandas as pd
from utils import load_list, write_list, printProgressBar
import argparse

# command line argument for how to split the words
parser = argparse.ArgumentParser()
parser.add_argument('-b', '--batch', type=int, default=500, action='store')
args = parser.parse_args()

# load terms and urls
print("loading files...")
terms = load_list("data/filteredTerms.txt")
urls = load_list("data/urls.txt")
counts = np.zeros([len(urls) + 1, len(terms)])
with open("data/counts.txt", 'r', encoding='utf-8', errors='ignore') as f:
    idx = 0
    l = f.readline()
    l = f.readline()
    while l != "":
        #clear_output(wait=True)
        printProgressBar(idx,
                         len(urls) - 1,
                         length=25,
                         suffix=urls[idx] + " " * (100 - len(urls[idx])))
        #print(idx,"of",len(urls)-1,urls[idx])
        counts[idx, :] = l.split("\t")[1:-1]
        idx += 1
        l = f.readline()
Example #16
import numpy as np
from matplotlib import pyplot as plt
from utils import load_list

loss = load_list(r'data/loss.txt')
loss = np.array([float(x) for x in loss])

acc = load_list(r'data/acc.txt')
acc = np.array([float(x) for x in acc])

plt.plot(range(len(loss)), loss)
plt.plot(range(len(acc)), acc)

plt.show()
Example #17
import utils
import nltk

filteredTerms = utils.load_list('data/filteredTerms.txt')
filteredTerms.sort()

dupeList = dict()

for idx, t in enumerate(list(filteredTerms)):
    surroundingWords = filteredTerms[max(0, idx - 3):idx + 3]
    l = [a for a in surroundingWords if t != a and nltk.edit_distance(t, a) <= 1]
    if len(l) > 0:
        dupeList[t] = l

for d in dupeList:
    print(d, dupeList[d])





keys = list(dupeList.keys())

dupeList['attack']
ing = [word for word in filteredTerms if 'ing' in word]
sWords = [word for word in filteredTerms if 's' == word[-1] and '\''!= word[-2]]



nltk.edit_distance('ability','abilities')
Example #18
            ]

# Go through remove list
removeList = list()
for r in badList:
    print(r)
    resp = requests.get(r)
    strainer = SoupStrainer(class_=['category-page__member'])
    soup = BeautifulSoup(resp.content,'lxml',parse_only=strainer)
    l = list()
    #soup.find('div', id="top-schedule").decompose()
    for t in soup.find_all("a"):
        if 'href' in t.attrs and "http" not in t['href'] and "#" not in t['href'] and "action=edit" not in t['href']:
            link = t['href']
            # remove '/wiki/' from beginning of link
            link = link.replace("/wiki/","")
            l.append(link)

    for a in l:
        removeList.append(a)
    time.sleep(1)

removeList = list(set(removeList))
baseList = utils.load_list("data/LeagueFandom.txt")

# Go through every item in removeList and remove it from baseList
for w in removeList:
    baseList = [b for b in baseList if b!=w]

utils.write_list(baseList,'data/LeagueFandom.txt')
Example #19
# removes other errors in terms
# finally removes duplicates
# author: Zack Wisti

stopwords = ["i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours",
			 "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself",
			 "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which",
			 "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was", "were", "be",
			 "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an",
			 "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for",
			 "with", "about", "against", "between", "into", "through", "during", "before", "after", "above",
			 "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further",
			 "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each",
			 "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so",
			 "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"]
championList = load_list('championList.txt')
championList = [b.lower() for b in championList]
itemList = load_list('itemList.txt')
itemList = [b.lower() for b in itemList]
def filterTerms(terms,banlist=None):
	terms = [b.lower() for b in terms]
	# get the stem from any thing that is directory format
	bonus = list()
	for b in terms:
		if "/" in b:
			bonus += b.split("/")
	terms += bonus		
	terms = [b for b in terms if not "/" in b]
	
	# replace _ with a space
	terms = [re.sub("_"," ", b) for b in terms]