Example No. 1
def build_data(fnamex, fnamey, num_layers, max_seq_len):
    fdx, fdy = open(fnamex), open(fnamey)
    x_token_list = []
    y_token_list = []

    # we need to fill in the entire dataset
    linex, liney = fdx.readline(), fdy.readline()

    while linex and liney:
        x_tokens, y_tokens = util.tokenize(linex), util.tokenize(liney)

        # this is not truncating...just ignoring
        if len(x_tokens) < max_seq_len and len(y_tokens) < max_seq_len:
            x_token_list.append(x_tokens)
            y_token_list.append(y_tokens)

        linex, liney = fdx.readline(), fdy.readline()

    y_token_list = add_sos_eos(y_token_list)  # shift y by 1 position
    x_padded, y_padded = padded(x_token_list, num_layers), padded(y_token_list, 1)

    source_tokens = np.array(x_padded).T
    source_mask = (source_tokens != PAD_ID).astype(np.int32)
    target_tokens = np.array(y_padded).T
    target_mask = (target_tokens != PAD_ID).astype(np.int32)

    return source_tokens, source_mask, target_tokens, target_mask
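For reference, a minimal sketch of the padding-and-mask step at the end of build_data, using a hypothetical PAD_ID of 0 and a hand-made padded batch (both are illustrative assumptions, not values from the project):

import numpy as np

# Hypothetical padding id; the real constant comes from the project's vocabulary setup.
PAD_ID = 0
x_padded = [[4, 7, 2, 0, 0],
            [5, 9, 0, 0, 0]]

source_tokens = np.array(x_padded).T                      # shape (seq_len, batch)
source_mask = (source_tokens != PAD_ID).astype(np.int32)  # 1 for real tokens, 0 for padding
print(source_mask.T)
# [[1 1 1 0 0]
#  [1 1 0 0 0]]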
Example No. 2
def get_similarity(in_phrase, words, embeddings):
    # word embeddings is assumed to be a dict of {word: embedding} pairs of type {string: np array}
    # words is assumed to be a list of lists of form [word, freq], types - [str, int]
    # if it is a string then it is converted to the [[w1,f1],[w2,f2],...] form

    #	words = ast.literal_eval(words)
    if isinstance(words, str):
        words = util.word_freq(
            util.tokenize(words))  # -----> tokenizing takes out NEs by default

    a = np.zeros(
        len(embeddings['example']
            ))  # initialize vector to same dimension as preloaded embeddings
    for word in util.tokenize(in_phrase):
        a = a + embeddings[
            word]  # -----> this should also be weighted more by thing than category

    a = a / np.sqrt(np.dot(a, a))  # -----> normalize

    b = np.zeros(len(embeddings['example']))
    for [word, freq] in words:
        if word in embeddings.keys():
            b = b + freq * embeddings[word]  # weights

    b = b / np.sqrt(np.dot(b, b))  # -----> normalize

    similarity = np.dot(a, b)  # normalized weighted inner product by frequency
    return similarity
Example No. 3
def get_distance(in_phrase, words, embeddings):
    # word embeddings is assumed to be a dict of {word: embedding} pairs of type {string: np array}
    # words is assumed to be a list of lists of form [word, freq], types - [str, int]
    # if it is a string then it is converted to the [[w1,f1],[w2,f2],...] form

    #	words = ast.literal_eval(words)
    if isinstance(words, str):
        words = util.word_freq(
            util.tokenize(words))  # -----> tokenizing takes out NEs by default
    a = np.zeros(
        len(embeddings['example']
            ))  # initialize vector to same dimension as preloaded embeddings
    for word in util.tokenize(in_phrase):
        a = a + embeddings[word]
    a = a / np.sqrt(np.dot(a, a))
    b = np.zeros(len(embeddings['example']))
    for [word, freq] in words:
        if word in embeddings.keys():
            b = b + freq * embeddings[word]  # weights
    b = b / np.sqrt(np.dot(b, b))

    distance = np.sqrt(np.sum(
        np.square(a - b)))  # euclidean distance between weighted vectors
    distance = (
        2 - distance
    ) / 2  # -------> transform so that more relevant targets get a higher score from this function
    return distance
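The two functions above share the same vector arithmetic, so a self-contained sketch of that arithmetic may help; the toy embeddings dict and the hand-built [[word, freq], ...] list below stand in for the project's util.tokenize and util.word_freq and are purely illustrative:

import numpy as np

# Toy embeddings of the assumed {word: np.array} shape; 'example' is the key the
# functions read to learn the embedding dimension.
embeddings = {
    "example": np.array([1.0, 0.0, 0.0]),
    "solar":   np.array([0.9, 0.1, 0.0]),
    "panel":   np.array([0.8, 0.2, 0.1]),
    "energy":  np.array([0.7, 0.3, 0.0]),
}
words = [["solar", 3], ["energy", 1]]  # assumed [[word, freq], ...] form

# phrase vector: plain sum of token embeddings, L2-normalised
a = embeddings["solar"] + embeddings["panel"]
a = a / np.sqrt(np.dot(a, a))

# target vector: frequency-weighted sum, L2-normalised
b = sum(freq * embeddings[w] for w, freq in words if w in embeddings)
b = b / np.sqrt(np.dot(b, b))

print("similarity:", np.dot(a, b))                         # what get_similarity returns
print("distance score:", (2 - np.linalg.norm(a - b)) / 2)  # what get_distance returns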
Example No. 4
def get_similarities():
    global model

    check_model()

    comparison = int(flask.request.args.get("article"))
    ctoken = util.tokenize([util.fetch(comparison)[1]])

    recent_articles = util.fetch_top_100(comparison)
    recent_article_ids = [a[0] for a in recent_articles]

    founds, not_founds = util.fetch_with(comparison, recent_article_ids)
    result_ids = [f[0] for f in founds]

    similarities = [f[1] for f in founds]

    not_found_articles = [a for a in recent_articles if a[0] in not_founds]

    for a in not_found_articles:
        sim = model.docvecs.similarity_unseen_docs(model, ctoken[0], util.tokenize([a[1]])[0])
        util.insert_similarity(comparison, a[0], sim.item())
        result_ids.append(a[0])
        similarities.append(str(sim))

    return flask.jsonify(
        result="ok",
        articleIds=result_ids,
        similarities=similarities
    )
Example No. 5
def main():
    parser = argparse.ArgumentParser("Creates a result file indicating the sentence-difference baseline.")
    parser.add_argument("--vectorspace", "-v", help="Vector space file.")
    parser.add_argument("--sentences", "-s", help="The appropriate .txt file in resources.")
    parser.add_argument("--vectormaker", choices=("add", "mul"), default="add", help="Use vector addition or multiplication.")
    args = parser.parse_args()

    if args.vectormaker == "add":
        combiner = operator.add
    elif args.vectormaker == "mul":
        combiner = operator.mul
    else:
        sys.stderr.write("Don't know what to do with vectormaker '%s'." % args.vectormaker)
        sys.exit(1)

    sentence_pairs = [l.strip().lower().split("\t") for l in open(args.sentences)]
    tokenized_sentences = [(tokenize(left), tokenize(right)) for left, right in sentence_pairs]

    vectorspace = load_vectorspace(args.vectorspace)

    sys.stdout.write("Errors found: 0\n")
    sys.stdout.write("[")
    for left_sent, right_sent in tokenized_sentences:
        left_pieces, right_pieces = find_difference(left_sent, right_sent)
        features = extract_features(left_pieces, right_pieces, vectorspace, combiner)
        sys.stdout.write(" " + ",".join(map(str, features)))
    sys.stdout.write("]\n")
Example No. 6
def train1(english, french, trans_prob, loop_count=20):

    for i in range(loop_count):

        print("Running Iteration..", i + 1)

        count = collections.defaultdict(float)
        total = collections.defaultdict(float)

        sum_total = {}

        # iterate over sentence pairs without overwriting the corpus lists
        for (eng_sent, fr_sent) in zip(english, french):

            eng_tokens = tokenize(eng_sent)
            fr_tokens = tokenize(fr_sent)
            for e in eng_tokens:
                sum_total[e] = 0.0
                for f in fr_tokens:
                    sum_total[e] += trans_prob[(e, f)]

            for e in eng_tokens:
                for f in fr_tokens:
                    count[(e, f)] += trans_prob[(e, f)] / sum_total[e]
                    total[f] += trans_prob[(e, f)] / sum_total[e]

        for (e, f) in count.keys():
            trans_prob[(e, f)] = count[(e, f)] / total[f]

        pickle.dump(trans_prob, \
           open('../OutputFiles/map1_'+str(size)+'_'+str(i+1)+'.pickle','wb'))

    return trans_prob
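As a sanity check on the update rule above, here is one expectation/maximization pass on a single toy sentence pair with uniform starting probabilities (a sketch only; names and values are illustrative):

import collections

english_toks = ["the", "house"]
french_toks = ["la", "maison"]
trans_prob = collections.defaultdict(lambda: 0.25)  # uniform start

count = collections.defaultdict(float)
total = collections.defaultdict(float)
for e in english_toks:
    sum_total = sum(trans_prob[(e, f)] for f in french_toks)
    for f in french_toks:
        count[(e, f)] += trans_prob[(e, f)] / sum_total
        total[f] += trans_prob[(e, f)] / sum_total

for (e, f) in count:
    trans_prob[(e, f)] = count[(e, f)] / total[f]

print(trans_prob[("the", "la")])  # 0.5 after one uniform-start iteration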
Example No. 7
def tokenize_qa(qa, context):
    tokenized_qa = list()
    a = qa["answers"][0]
    question_answer = dict()
    question_answer["question"] = tokenize(qa["question"])
    question_answer["answer_begin"], question_answer["answer_end"] = \
        find_subtext(context, tokenize(a["text"]))
    return question_answer
Example No. 8
def initialize(options):
    if not OPENCL:
        options.no_ocl = True
        return []

    options.worksize = tokenize(options.worksize, 'worksize')
    options.frames = tokenize(options.frames, 'frames', [30])
    options.frameSleep = tokenize(options.frameSleep, 'frameSleep', cast=float)
    options.vectors = [True] if options.old_vectors else tokenize(
        options.vectors, 'vectors', [False], bool)

    platforms = cl.get_platforms()

    if options.platform >= len(platforms) or (options.platform == -1
                                              and len(platforms) > 1):
        print(
            'Wrong platform or more than one OpenCL platforms found, use --platform to select one of the following\n'
        )
        for i in range(len(platforms)):
            print('[%d]\t%s' % (i, platforms[i].name))
        sys.exit()

    if options.platform == -1:
        options.platform = 0

    devices = platforms[options.platform].get_devices()

    if not options.device and devices:
        print('\nOpenCL devices:\n')
        for i in range(len(devices)):
            print('[%d]\t%s' % (i, devices[i].name))
        print('\nNo devices specified, using all GPU devices\n')

    miners = [
        OpenCLMiner(i, options) for i in range(len(devices))
        if ((not options.device and devices[i].type == cl.device_type.GPU) or (
            i in options.device))
    ]

    for i in range(len(miners)):
        miners[i].worksize = options.worksize[min(i,
                                                  len(options.worksize) - 1)]
        miners[i].frames = options.frames[min(i, len(options.frames) - 1)]
        miners[i].frameSleep = options.frameSleep[min(
            i,
            len(options.frameSleep) - 1)]
        miners[i].vectors = options.vectors[min(i, len(options.vectors) - 1)]
        miners[i].cutoff_temp = options.cutoff_temp[min(
            i,
            len(options.cutoff_temp) - 1)]
        miners[i].cutoff_interval = options.cutoff_interval[min(
            i,
            len(options.cutoff_interval) - 1)]
    return miners
Example No. 9
def main():
    parser = argparse.ArgumentParser("Creates a result file by counting occurrences of key phrases.")
    parser.add_argument("--sentences", "-s", help="The appropriate .txt file in resources.")
    args = parser.parse_args()

    sentence_pairs = [l.strip().lower().split("\t") for l in open(args.sentences)]
    tokenized_sentences = [(tokenize(left), tokenize(right)) for left, right in sentence_pairs]

    sys.stdout.write("Errors found: 0\n")
    sys.stdout.write("[")
    for left_sent, right_sent in tokenized_sentences:
        features = extract_features(left_sent, right_sent)
        sys.stdout.write(" " + ",".join(map(str, features)))
    sys.stdout.write("]\n")
Example No. 10
def compute_inverted_index(coll_folder, stemming, output_file_path_ii):
    if not os.path.isfile(output_file_path_ii):
        print('computing inverted index')
        inverted_idx = {}
        sw = util.load_indri_stopwords()
        doc_n = 0
        for filename in tqdm(os.listdir(coll_folder)):
            fp = os.path.join(coll_folder, filename)
            doc_id = filename.split(r'.')[0]
            if os.path.isfile(fp):
                doc_n += 1
                d = util.tokenize(' '.join(open(fp, 'r').readlines()),
                                  stemming,
                                  stoplist=sw)
                set_w_in_doc = set(d)
                for w in set_w_in_doc:
                    if w in inverted_idx.keys():
                        inverted_idx[w].append((doc_id, d.count(w)))
                    else:
                        inverted_idx[w] = [(doc_id, d.count(w))]

        util.save_model(inverted_idx, output_file_path_ii)
    else:
        inverted_idx = util.load_model(output_file_path_ii)
    return inverted_idx
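A minimal sketch of the {word: [(doc_id, count), ...]} structure the function builds, using two in-memory strings instead of a collection folder and plain whitespace splitting instead of util.tokenize:

# Two toy "documents"; tokenization reduced to str.split for illustration.
docs = {"d1": "the cat sat on the mat", "d2": "the dog sat"}

inverted_idx = {}
for doc_id, text in docs.items():
    tokens = text.split()
    for w in set(tokens):
        inverted_idx.setdefault(w, []).append((doc_id, tokens.count(w)))

print(inverted_idx["sat"])  # [('d1', 1), ('d2', 1)]
print(inverted_idx["the"])  # [('d1', 2), ('d2', 1)]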
Example No. 11
def encode_collection(text_by_name_p, word_dict_path, encoded_out_folder):
    # word_dict_path = '/media/alberto/DATA/BaiduNetdiskDownload/data/word_dict.txt'
    text_by_name = {}
    print('reading files in folder')
    for filename in tqdm(os.listdir(text_by_name_p)):
        fp = os.path.join(text_by_name_p, filename)
        if os.path.isfile(fp):
            text_by_name[filename.split(r'.')[0]] = ' '.join(
                open(fp, 'r').readlines())
    print('reading word2vec model')
    encoded_docs_by_name = {}
    wi = {}
    for line in tqdm(open(word_dict_path)):
        data = line.split()
        word = data[0].strip()
        wid = int(data[1].strip())
        if word not in wi.keys():
            wi[word] = wid
    sw = load_indri_stopwords()
    print('encoding data')
    for dn, dc in tqdm(text_by_name.items()):
        td = util.tokenize(dc, stemming=False, stoplist=sw)
        encoded_doc = [wi[w] for w in td if w in wi.keys()]
        util.save_model(encoded_doc, os.path.join(encoded_out_folder, dn))
        encoded_docs_by_name[dn] = encoded_doc
    return encoded_docs_by_name
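The encoding step itself reduces to a dictionary lookup that drops out-of-vocabulary tokens; a tiny illustration with made-up ids:

# Hypothetical word-index dict and token list; OOV tokens are simply dropped,
# exactly as in the list comprehension above.
wi = {"solar": 0, "panel": 1, "energy": 2}
td = ["solar", "panel", "output", "energy"]
encoded_doc = [wi[w] for w in td if w in wi]
print(encoded_doc)  # [0, 1, 2]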
Example No. 12
def test_tokens_expr():
    src, names = expr_fix
    assert list(util.tokenize(src)) == [
        TokenInfo(type=2, string='2', start=(1, 0), end=(1, 1), line='2 + 3'),
        TokenInfo(type=14, string='+', start=(1, 2), end=(1, 3), line='2 + 3'),
        TokenInfo(type=2, string='3', start=(1, 4), end=(1, 5), line='2 + 3'),
        TokenInfo(type=0, string='', start=(2, 0), end=(2, 0), line='')]
Example No. 13
def tokenize_data(X):
    ''' Store token lists of questions and responses in DataFrame X '''
    X['t_q'] = X['Question Text'].apply(tokenize)
    X['t_r'] = X['Response'].apply(tokenize)
    X['t_r_f'] = X['Response'].apply(lambda x: tokenize(first_sentence(x)))

    return X
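A small usage sketch with a one-row DataFrame; str.split and a naive sentence splitter stand in for the project's tokenize and first_sentence helpers (both stand-ins are assumptions):

import pandas as pd

# Hypothetical frame with the two columns tokenize_data expects.
X = pd.DataFrame({
    "Question Text": ["What time is the meeting?"],
    "Response": ["It starts at noon. Please be early."],
})

tokenize = str.split                              # stand-in tokenizer
first_sentence = lambda text: text.split(".")[0]  # stand-in sentence splitter

X["t_q"] = X["Question Text"].apply(tokenize)
X["t_r"] = X["Response"].apply(tokenize)
X["t_r_f"] = X["Response"].apply(lambda x: tokenize(first_sentence(x)))
print(X[["t_q", "t_r_f"]])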
Example No. 14
def get_we_matrix_wi_encode_docs_w_fasttext(ftext_model_path,
                                            docs_text_main_folder,
                                            encoded_out_folder_docs):
    f = load_model(ftext_model_path)
    text_by_name = {}
    print('reading files in folder')
    for filename in tqdm(os.listdir(docs_text_main_folder)):
        fp = os.path.join(docs_text_main_folder, filename)
        if os.path.isfile(fp):
            text_by_name[filename.split(r'.')[0]] = ' '.join(
                open(fp, 'r').readlines())
    stoplist = load_indri_stopwords()

    print('encoding collection')
    encoded_docs_by_name = {}
    wi = {}
    we_matrix = []
    for dn, dt in tqdm(text_by_name.items()):
        tok_doc = util.tokenize(dt, stemming=False, stoplist=stoplist)
        encoded_doc = []
        for tok in tok_doc:
            if tok not in wi.keys():
                wv = f.get_word_vector(tok)
                wi[tok] = len(wi)
                we_matrix.append(wv)
            encoded_doc.append(wi[tok])
        util.save_model(encoded_doc, os.path.join(encoded_out_folder_docs, dn))
        encoded_docs_by_name[dn] = encoded_doc
    return encoded_docs_by_name, wi, we_matrix
Example No. 15
def extract_emoticon(text):
    emoticons = []
    for token, tag in tag_tweets(tokenize(text)):
        if tag == "emoticon":
            emoticons.append(token)
#            text = text.replace(token, "")
    return emoticons, text
Example No. 16
def make_concepts_baseline(id, path, sents, query):
    """
    only use first sentences
    TODO: choose best of first 3
    """
    
    query_words = set(util.porter_stem_sent(util.remove_stopwords(util.tokenize(fix_text(query)))).split())
    seen_sents = set()
    all_concepts = collections.defaultdict(set)
    max_order = 0
    for sent in sents:
        
        ## store this sentence's concepts
        sent.concepts = set([])
        concepts = set(util.get_ngrams(sent.tok2, 2, bounds=False, as_string=True))

        ## get query overlap
        query_overlap = set(util.remove_stopwords(sent.tok2.split())).intersection(query_words)

        ## aggregate all concepts
        if len(query_overlap) > 0:
            for concept in concepts:
                all_concepts[concept].add(sent.doc)

            if sent.order == 0:
                for concept in concepts:
                    all_concepts[concept].add(sent.doc + 'first')

        ## ignore some sents
        if sent.order == 0: max_order = 0
        skip = False
        if sent.length <= 5: skip = True
        if sent.tok in seen_sents: skip = True
        #if sent.length < 20: skip = True
        if sent.order > max_order or max_order > 0: 
            skip = True
            max_order = 0
        
        if skip: 
            max_order += 1
            continue
        
        #print sent.order, max_order, sent.doc, sent
        seen_sents.add(sent.tok)
        sent.concepts = concepts

    ## create final concept set
    final_concepts = {}
    for concept, docs in all_concepts.items():
        count = len(docs)
        #if count < 3: continue
        if util.is_just_stopwords(concept.split('_')): continue
        final_concepts[concept] = count
    final_concept_set = set(final_concepts.keys())

    for sent in sents:
        sent.concepts = sent.concepts.intersection(final_concept_set)
        
    return create_ilp_output(sents, final_concepts, path+id)
Example No. 17
 def indexDoc(self, doc):  # indexing a Document object
     """ indexing a docuemnt, using the simple SPIMI algorithm, but no need to store blocks due to the small collection we are handling. Using save/load the whole index instead"""
     self.nDocs += 1
     tokens = util.tokenize(doc.title + "\n" + doc.body)
     for i, token in enumerate(tokens):
         if token not in self.items:
             self.items[token] = IndexItem(token)
         self.items[token].add(doc.docID, i)
Example No. 18
def test_tokens_expr():
    src, names = expr_fix
    assert list(util.tokenize(src)) == [
        TokenInfo(type=2, string='2', start=(1, 0), end=(1, 1), line='2 + 3'),
        TokenInfo(type=14, string='+', start=(1, 2), end=(1, 3), line='2 + 3'),
        TokenInfo(type=2, string='3', start=(1, 4), end=(1, 5), line='2 + 3'),
        TokenInfo(type=0, string='', start=(2, 0), end=(2, 0), line='')
    ]
Example No. 19
def process_input(tweet):
    """
    Process input data to tokenize and make labels.
    """
    tokens = tokenize(to_ascii(tweet.text))
    label = LABELS.index(tweet.topic)
    assert label >= 0
    return tweet.id, tokens, label
Example No. 20
def give_loc_tf(text):
	ls_text = [x.lower() for x in tokenize(text)]
	freq = FreqDist(ls_text)
	ls = freq.most_common(len(freq))
	loc_tf = {}
	for x in ls:
		loc_tf[x[0]] = x[1]
	return loc_tf
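The same local term-frequency dict can be illustrated with collections.Counter in place of NLTK's FreqDist, assuming whitespace tokenization:

from collections import Counter

text = "Paris is in France and Paris is large"
loc_tf = dict(Counter(w.lower() for w in text.split()))
print(loc_tf["paris"])  # 2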
Example No. 21
def getAmper(lines: str):
    out = []
    for line in tokenize(lines, add=(",", "+", "-", "*", '=')):
        start = line.find("$")
        if start == -1:
            continue
        out.append(line)
    return out
Example No. 22
def initialize(options):
	if not OPENCL:
		options.no_ocl = True
		return []

	options.worksize = tokenize(options.worksize, 'worksize')
	options.frames = tokenize(options.frames, 'frames', [30])
	options.frameSleep = tokenize(options.frameSleep, 'frameSleep', cast=float)
	options.vectors = [True] if options.old_vectors else tokenize(options.vectors, 'vectors', [False], bool)

	platforms = cl.get_platforms()

	if options.platform >= len(platforms) or (options.platform == -1 and len(platforms) > 1):
		print 'Wrong platform or more than one OpenCL platforms found, use --platform to select one of the following\n'
		for i in xrange(len(platforms)):
			print '[%d]\t%s' % (i, platforms[i].name)
		sys.exit()

	if options.platform == -1:
		options.platform = 0

	devices = platforms[options.platform].get_devices()

	if not options.device and devices:
		print '\nOpenCL devices:\n'
		for i in xrange(len(devices)):
			print '[%d]\t%s' % (i, devices[i].name)
		print '\nNo devices specified, using all GPU devices\n'

	miners = [
		OpenCLMiner(i, options)
		for i in xrange(len(devices))
		if (
			(not options.device and devices[i].type == cl.device_type.GPU) or
			(i in options.device)
		)
	]

	for i in xrange(len(miners)):
		miners[i].worksize = options.worksize[min(i, len(options.worksize) - 1)]
		miners[i].frames = options.frames[min(i, len(options.frames) - 1)]
		miners[i].frameSleep = options.frameSleep[min(i, len(options.frameSleep) - 1)]
		miners[i].vectors = options.vectors[min(i, len(options.vectors) - 1)]
		miners[i].cutoff_temp = options.cutoff_temp[min(i, len(options.cutoff_temp) - 1)]
		miners[i].cutoff_interval = options.cutoff_interval[min(i, len(options.cutoff_interval) - 1)]
	return miners
Example No. 23
def _tokenize_text_for_nodes(nodes, email_text, email_idents, stops):
    documents = []
    for node in nodes:
        relevant_emails = email_idents[node.parent][node.payload]
        relevant_emails += email_idents[node.payload][node.parent]
        fulltext = '\n'.join(email_text.get(ident, '')
                             for ident in relevant_emails)
        documents.append(util.tokenize(fulltext, stops))
    return documents
Example No. 24
def encode_collection_with_stemming(text_by_name_p,
                                    word_dict_path,
                                    w2v_model_path,
                                    encoded_out_folder,
                                    wi=None,
                                    word_embeddings_matrix=None):
    text_by_name = {}
    print('reading files in folder')
    for filename in tqdm(os.listdir(text_by_name_p)):
        fp = os.path.join(text_by_name_p, filename)
        if os.path.isfile(fp):
            text_by_name[filename.split(r'.')[0]] = ' '.join(
                open(fp, 'r').readlines())

    # initialize embeddings matrix
    if word_embeddings_matrix is None:
        # read and adapt word index
        if wi is None:
            wi = {}
            wids_to_merge = {}
            for line in tqdm(open(word_dict_path)):
                data = line.split()
                word_stemmed = util.stem(data[0].strip())
                wid = int(data[1].strip())
                if word_stemmed not in wi.keys():
                    wi[word_stemmed] = len(wi)
                    wids_to_merge[word_stemmed] = [wid]
                else:
                    wids_to_merge[word_stemmed].append(wid)
        we_size = 50
        word_embeddings_matrix = np.float32(
            np.random.uniform(-0.02, 0.02, [len(wi) + 1, we_size]))
        padding_value = np.zeros(we_size)
        word_embeddings_matrix[word_embeddings_matrix.shape[0] -
                               1] = padding_value
        w2v_model = load_w2v_we(w2v_model_path)
        for k, v in wi.items():
            we = np.zeros(we_size)
            summed_something = False
            for wid in wids_to_merge[k]:
                if wid in w2v_model.keys():
                    we = np.sum((we, w2v_model[wid]), axis=0)
                    summed_something = True
            if summed_something:
                we = we / np.linalg.norm(we)  # normalize new word embedding
                word_embeddings_matrix[v] = we

    encoded_docs_by_name = {}
    sw = load_indri_stopwords()
    print('encoding data')
    for dn, dc in tqdm(text_by_name.items()):
        td = util.tokenize(dc, stemming=True, stoplist=sw)
        encoded_doc = [wi[w] for w in td if w in wi.keys()]
        util.save_model(encoded_doc, os.path.join(encoded_out_folder, dn))
        encoded_docs_by_name[dn] = encoded_doc
    return encoded_docs_by_name, wi, word_embeddings_matrix
Example No. 25
    def _get_operator_name(self):

        if "operator" in self.df.columns:
            opname = self.df.operator.astype(str).mode().iloc[0]
        else:
            opname = util.tokenize(self.filename,
                                   exclude=self.exclude,
                                   take_basename=True)[0]
        self.operator = Operator(opname)
        return self
Example No. 26
 def onCreate(self, create=True):
     super(Operands, self).onCreate()
     for token in tokenize(self.token, (",",)):
         if isExpression(token):
             o = Expr(token, self.line)
         else:
             o = Operand(token, self.line)
         if create:
             o.onCreate()
         self.operands.append(o)
Example No. 27
def encode_queries(queries_main_folder, wi, stemming):
    sw = util.load_indri_stopwords()
    encoded_qbn = {}
    for filename in tqdm(os.listdir(queries_main_folder)):
        fp = os.path.join(queries_main_folder, filename)
        if os.path.isfile(fp):
            tokenized_query = util.tokenize(' '.join(open(fp, 'r').readlines()), stemming=stemming, stoplist=sw)
            qn = filename.split(r'.')[0]
            encoded_qbn[qn] = [wi[w] for w in tokenized_query if w in wi.keys()]
    return encoded_qbn
Example No. 28
def test():
    print(
        "test1:displays number of features generated from all the documents\n")
    f = open('feature_definition_file', 'r')
    count = 0
    k = f.readline()
    while k:
        count += 1
        k = f.readline()
    print("number of features generated in feature_defintion_file : " +
          str(count))
    f.close()
    print(
        "test2:verified that all the documents are read and parsed from the mininewsgroup directory\n"
    )
    f = open('training_data_file', 'r')
    count = 0
    k = f.readline()
    while k:
        count += 1
        k = f.readline()
    print("number of documents parsed from mininewsgroup : " + str(count))
    print("test3 : Given a filename and filepath parse the document\n")
    fil = open('class_definition_file', "r")
    classes = {}
    r = fil.readline()
    while r:
        p = str(r.strip()).split(" ")
        if p[0] in classes:
            classes[p[0]].append(p[1])
        else:
            classes[p[0]] = [p[1]]
        r = fil.readline()
    fil.close()
    directorypath = input(
        "Enter the filepath (eg:localpath/mini_newsgroups/alt.atheism/51121):\n"
    )
    ngobj = news.News(directorypath, classes)
    print("DOCID : " + ngobj.docID)
    print("Newsgroup : " + ngobj.newsgroup)
    print("Class : " + ngobj.class_label)
    print("Subject : " + ngobj.subject)
    print("Body : " + ngobj.body)
    print("test4\n")
    print(
        "Tokenizing the subject and body of the above given file,removing stop words and stemming: \n"
    )
    print(util.tokenize(ngobj.subject + " " + ngobj.body))
    print("test5 : printing inverted index of the given file\n")
    indexobjtest = InvertedIndex()
    indexobjtest.indexDoc(ngobj)
    for key in indexobjtest.items:
        print(key + " " + str(ngobj.docID) + " " +
              str(indexobjtest.items[key].posting[ngobj.docID].positions))
Example No. 29
def get_meta_text(meta_dict):
    """
	Returns a list of 2-element dictionaries of the form [word, freq] from the metadata in the HTML.
	"""
    if meta_dict is None:
        return None
    descriptions = list((meta_dict['descriptions']))
    titles = list((meta_dict['titles']))
    keywords = list(meta_dict['keywords'])
    text_data = []
    if len(descriptions) > 0:
        for d in descriptions:
            text_data.extend(util.tokenize(d))
    if len(titles) > 0:
        for t in titles:
            text_data.extend(util.tokenize(t))
    if len(keywords) > 0:
        for k in keywords:
            text_data.extend(util.tokenize(k))
    return util.word_freq(text_data)
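For clarity, the [word, freq] pairs returned here can be pictured with a Counter-based stand-in for util.word_freq (illustrative only):

from collections import Counter

text_data = ["solar", "panel", "solar", "news"]
word_freq = [[w, c] for w, c in Counter(text_data).items()]
print(word_freq)  # [['solar', 2], ['panel', 1], ['news', 1]]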
Example No. 30
def build_qid_map(outlines):
    qid_map = {}
    heading_map = {}
    with logger.duration('reading outlines'):
        for outline_qid, outline_text in tqdm(util.read_outlines(outlines)):
            qid_map['/'.join(outline_qid)] = outline_qid
            for heading, text in zip(outline_qid, outline_text):
                heading_map[heading] = util.tokenize(text)
        logger.debug('found {} qids, {} headings'.format(
            len(qid_map), len(heading_map)))
    return qid_map, heading_map
Example No. 31
def tokenize_paragraph(par):
    tok_par = dict()
    tok_par['context'] = tokenize(par['context'])
    tok_par['qas'] = list()
    for qas in par['qas']:
        try:
            tok_par['qas'].append(tokenize_qa(qas, tok_par["context"]))
        except NameError as ne:
            #print(ne)
            ''
    return tok_par
Example No. 32
    def executeQuery(self):
        query = open(self.nomeArqQuery).read().lower()
        query = util.tokenize(query)
        indObj = index.Index(self.nomeArqBase, self.nomeArqIndice)
        ind = indObj.loadIndex()
        indArqs = self.base.keys()

        # replace each token with its list of document indices
        for i, v in enumerate(query):
            if v not in self.OPERATORS:
                query[i] = [int(oc.doc) for oc in ind[v]]

        #NOT
        while True:
            flag = False
            for i, v in enumerate(query):
                if v == "!":
                    flag = True
                    del query[i] #remove operator
                    query[i] = conditions.Condition.notCondition(query[i], indArqs)
            
            if not flag:
                break
        
        #AND
        while True:
            flag = False
            for i, v in enumerate(query):
                if v == "&":
                    flag = True
                    del query[i] #remove operator
                    query[i - 1] = conditions.Condition.andCondition(query[i-1], query[i]) #execute intersection
                    del query[i] #remove one of the lists
            
            if not flag:
                break
        
        #OR
        while True:
            flag = False
            for i, v in enumerate(query):
                if (v == "|"):
                    flag = True
                    del query[i] #remove operator
                    query[i - 1] = conditions.Condition.orCondition(query[i-1], query[i]) #execute union
                    del query[i] #remove one of the lists
            
            if not flag:
                break

        query[0].sort()
        return [self.base[q] for q in query[0]]

        
Example No. 33
def get_keywords(text):
    data = {}
    for i, sentence in enumerate(get_sentences(text), start=1):
        tagged_tokens = pos_tag(tokenize(sentence))
        for term in get_terms(chunk(tagged_tokens)):
            _keywords(term, i, data)
    keywords = [{
        'keyword': v['term_forms'][0],
        'count': v['count'],
        'locations': v['locations']
    } for k, v in data.items()]
    return sorted(keywords, key=itemgetter('count'), reverse=True)
Example No. 34
def get_keywords(text):
    data = {}
    for i, sentence in enumerate(get_sentences(text), start=1):
        tagged_tokens = pos_tag(tokenize(sentence))
        for term in get_terms(chunk(tagged_tokens)):
            _keywords(term, i, data)
    keywords = [{
        'keyword': v['term_forms'][0],
        'count': v['count'],
        'locations': v['locations']
    } for k, v in data.items()]
    return sorted(keywords, key=itemgetter('count'), reverse=True)
Example No. 35
def tokenize_input(input_sent, vocab):
    """
    Return a numpy array where each row is the word-indexes for each sentence
    """
    input_tok = []

    # map text to integers
    for sent in input_sent:
        text_int = [-1 if t not in vocab else vocab[t] for t in tokenize(sent)]

        input_tok.append(np.array(text_int))

    return np.array(input_tok)
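A quick sketch of the mapping with a made-up vocabulary and whitespace tokenization; unknown tokens map to -1 exactly as in the comprehension above:

import numpy as np

vocab = {"the": 0, "cat": 1, "sat": 2}  # hypothetical vocabulary
tokenize = str.split                    # stand-in tokenizer

sentences = ["the cat sat", "the dog sat"]
encoded = [np.array([vocab.get(t, -1) for t in tokenize(s)]) for s in sentences]
print(encoded)  # [array([0, 1, 2]), array([ 0, -1,  2])]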
Example No. 36
def tokenize_input(input_sent, vocab):
    """
    Return a numpy array where each row is the word-indexes for each sentence
    """
    input_tok = []

    # map text to integers
    for sent in input_sent:
        text_int = [-1 if t not in vocab else vocab[t] for t in tokenize(sent)]

        input_tok.append(np.array(text_int))

    return np.array(input_tok)
Example No. 37
def main():
    parser = argparse.ArgumentParser(
        "Creates a result file indicating the sentence-difference baseline.")
    parser.add_argument("--vectorspace", "-v", help="Vector space file.")
    parser.add_argument("--sentences",
                        "-s",
                        help="The appropriate .txt file in resources.")
    parser.add_argument("--vectormaker",
                        choices=("add", "mul"),
                        default="add",
                        help="Use vector addition or multiplication.")
    args = parser.parse_args()

    if args.vectormaker == "add":
        combiner = operator.add
    elif args.vectormaker == "mul":
        combiner = operator.mul
    else:
        sys.stderr.write("Don't know what to do with vectormaker '%s'." %
                         args.vectormaker)
        sys.exit(1)

    sentence_pairs = [
        l.strip().lower().split("\t") for l in open(args.sentences)
    ]
    tokenized_sentences = [(tokenize(left), tokenize(right))
                           for left, right in sentence_pairs]

    vectorspace = load_vectorspace(args.vectorspace)

    sys.stdout.write("Errors found: 0\n")
    sys.stdout.write("[")
    for left_sent, right_sent in tokenized_sentences:
        left_pieces, right_pieces = find_difference(left_sent, right_sent)
        features = extract_features(left_pieces, right_pieces, vectorspace,
                                    combiner)
        sys.stdout.write(" " + ",".join(map(str, features)))
    sys.stdout.write("]\n")
Example No. 38
def prep_docs(path, out_path):
    files = os.popen('ls %s*.sent' %path).read().splitlines()

    ## on the first pass, create a vocab mapping
    vocab = set()
    for file in files:
        if '-B' in file: continue
        sents = open(file).read().splitlines()
        doc = prob_util.Counter()

        for sent in sents[:20]:
            s = util.porter_stem_sent(util.tokenize(fix_text(sent)))
            concepts = set(util.get_ngrams(s, 1, bounds=False, as_string=True))
            vocab.update(concepts)

    fh = open(out_path+'vocab', 'w')
    vocab = zip(vocab, range(len(vocab)))
    for concept, count in vocab:
        fh.write('%s %d\n' %(concept, count))
    fh.close()
    vocab = dict(vocab)

    ## on the second pass, output one doc per line
    for file in files:
        if '-B' in file: continue
        sents = open(file).read().splitlines()
        doc = prob_util.Counter()

        for sent in sents[:20]:
            s = util.porter_stem_sent(util.tokenize(fix_text(sent)))
            concepts = set(util.get_ngrams(s, 1, bounds=False, as_string=True))
            for concept in concepts:
                doc[concept] += 1

        ## doc output
        output = '%d %s' %(len(doc), ' '.join(['%s:%d' %(vocab[t],c) for t,c in doc.items()]))
        print output
Example No. 39
def trainAll():
    trans_prob = collections.defaultdict(_constant_factory(1e-6))
    i = 0

    while i < len(english):
        if (i % size == 0):
            print("Parallel corpus Added")
            global parallel_corpus
            trans_prob = train1(english, french, trans_prob)
            with open('OutputFiles/transProb' + str(i) + '.pickle',
                      'wb') as fp:
                pickle.dump(trans_prob, fp)
            print("Done Training for lines ", i, "\n\n")
            parallel_corpus = []

            print(trans_prob[('on', 'sur')])

        parallel_corpus.append((tokenize(english[i]), tokenize(french[i])))

        i += 1

    trans_prob = train1(english, french, trans_prob)

    return trans_prob
Example No. 40
 def __init__(self, id, order, orig, doc, tok=None, parse=None, par=None, unresolved=False):
     self.id = id
     self.order = order
     self.orig = orig
     self.tok = tok
     self.tok2 = util.porter_stem_sent(util.tokenize(fix_text(self.orig)))
     self.doc = doc
     self.parse = parse
     self.new_par = (par == '1')
     self.length = len(self.orig.split())
     self.depends = set()
     self.groups = []
     self.skip = False
     self.skip_concepts = False
     self.unresolved = unresolved
     self.atleast = ""
Example No. 41
    def createIndex(self):

        indice = {}
        i = 1

        arquivosTexto = self.loadBase()

        for arquivo in arquivosTexto.values():
            arqTexto = open(arquivo)

            # split the text into words
            tokens = util.tokenize(arqTexto.read()
                                   .replace(".", " ").replace(",", " ")
                                   .replace("!", " ").replace("?", " ")
                                   .replace("\n", " ").lower())

            # for each token
            for token in tokens:
                # if the token is already in the dictionary
                if token in indice:
                    # if this file's index is already linked to the token
                    if i in indice[token]:
                        indice[token][i] += 1  # increment the occurrence count
                    else:
                        indice[token][i] = 1  # initialise with one occurrence
                else:
                    indice[token] = {i: 1}  # initialise the dictionary

            i += 1
            arqTexto.close()

        # sort the index
        indice = OrderedDict(sorted(indice.items()))
        # open the index file
        arqIndice = open(self.nomeArqIndiceResult, 'w')

        # for each token in the index
        for tok, dic in indice.items():
            linha = tok + ":"
            for arq, ocor in dic.items():  # for each file that contains the token
                linha = linha + " " + str(arq) + "," + str(ocor)
            arqIndice.write(linha + "\n")  # write the line

        # close the files
        arqIndice.close()
Example No. 42
def correct(sentence):
    sentence = preprocess(sentence)
    tokens = tokenize(sentence)
    print('segmented sentence is:', ''.join([str(token) for token in tokens]))
    seg_range = [[token[1], token[2]] for token in tokens]
    _, _, maybe_error_range = score_sentence(sentence)
    maybe_error_ranges = []
    if maybe_error_range:
        print('maybe error range:', maybe_error_range)
        maybe_error_ranges = merge_ranges(overlap_ranges(maybe_error_range, seg_range))
        for range in maybe_error_ranges:
            start_index, end_index = range
            print('maybe error words:', sentence[start_index:end_index])
            corrected_words = correct_chars(sentence, start_index, end_index)
            print('corrected words:', corrected_words)
            sentence = sentence[:start_index] + corrected_words + sentence[end_index:]
    return sentence, maybe_error_ranges
Example No. 43
def make_concepts_gold(id, path, sents, gold_sents):

    ## get gold concepts
    all_concepts = collections.defaultdict(set)
    for sent in gold_sents:
        concepts = set(util.get_ngrams(sent.tok2, 2, bounds=False, as_string=True))
        for concept in concepts:
            all_concepts[concept].add(sent.doc)

    ## create final concept set
    final_concepts = {}
    for concept, docs in all_concepts.items():
        count = len(docs)
        if util.is_just_stopwords(concept.split("_")):
            continue
        final_concepts[concept] = count
    final_concept_set = set(final_concepts.keys())

    ## get sentence concepts
    seen_sents = set()
    for sent_index in range(len(sents)):
        sent = sents[sent_index]
        sent.concepts = set([])

        ## skip some sents
        skip = False
        # if sent.order >= 3: skip = True
        if not sent.new_par:
            skip = True
        if sent.length < 20:
            skip = True

        if sent.orig in seen_sents:
            skip = True
        if sent.length <= 5:
            skip = True
        if skip:
            continue

        seen_sents.add(sent.orig)
        s = util.porter_stem_sent(util.tokenize(fix_text(sent.orig)))
        concepts = set(util.get_ngrams(s, 2, bounds=False, as_string=True))
        sent.concepts = concepts.intersection(final_concept_set)

    return create_ilp_output(sents, final_concepts, path + id)
Example No. 44
def do_run(args):
    """
    Run the neural net to predict on new data.
    """
    # Load the model and weights
    model = load_model(args.model, args.weights)
    wvecs = WordVectorModel.from_file(args.wvecs, False, '*UNKNOWN*')

    data = ((tweet.id, tokenize(to_ascii(tweet.text))) for tweet in RowObjectFactory.from_stream(csv.reader(args.input, delimiter="\t")))
    writer = csv.writer(args.output, delimiter='\t')
    writer.writerow(['id',] + LABELS)

    for ix in tqdm(grouper(args.batch_size, data)):
        ids_batch, X_batch = zip(*ix)
        X_batch = wvecs.embed_sentences(X_batch)
        labels = model.predict_on_batch(X_batch)
        for id, label in zip(ids_batch, labels):
            writer.writerow([id,] + [float(l) for l in label])
Example No. 45
def parse_line(line, init=u'english', tokenize=False):
    import re

    stack = [init]
    bracketRec = re.compile(ur'(<([^> ]+)\s*[^>]*>)')

    to_return = []
    proper_line = [x.strip() for x in bracketRec.sub(ur' <\2> ', line).split()]

    if tokenize:
        import util

        # workaround for angle bracket chars
        to_join = []
        for w in proper_line:
            if bracketRec.match(w):
                to_join.append(w.replace(u'<', u'__begin__').replace(u'>', u'__end__'))
            else:
                to_join.append(w)
        to_tok = [u' '.join(to_join)]
        tokenized = util.tokenize(to_tok)[0].split()
        proper_line = []
        for t in tokenized:
            proper_line.append(t.replace(u'__begin__', u'<').replace(u'__end__', u'>'))

    for w in proper_line:
        logging.info(u'current word: {}'.format(w))
        m = bracketRec.match(w)
        if m is not None:
            lang_string = m.groups()[1]
            if lang_string == stack[-1]:
                logging.warn(u'language {} already appeared on stack for word {} in line \n {}'.format(lang_string, w, line))
            if lang_string.startswith('/'):
                stack.pop()
                if len(stack) == 0:
                    logging.warn(u'stack empty after popping {}'.format(lang_string))
                    stack.append(init)
            else:
                stack.append(lang_string)
                logging.info(u'new language: {}'.format(lang_string))
            continue
        else:
            to_return.append((w, stack[-1]))
    return to_return
Example No. 46
	def __init__(self, bytes, id, order, orig, doc, tok=None, par=None, unresolved=False, lang='fr'):
		self.id = id
		self.order = order
		self.orig = orig
		self.tok = tok
		if lang=='en':
			self.tok2 = util.porter_stem_sent(util.tokenize(fix_text(self.orig)))
		elif lang=='fr':
			self.tok2 = " ".join([stmr.stem(w) for w in nltk.tokenize.word_tokenize(self.orig.decode('utf8'))])
		else:
			print 'Unsupported language...'
			sys.exit(0)
#		print "TOK2", len(self.tok2)
		self.doc = doc
		self.new_par = (par == '1')
		if bytes >- 1:
			self.length = len(self.orig) #for bytes
		else:
			self.length = len(self.orig.split())
		self.unresolved = unresolved
Example No. 47
def train():
    """
    Accept text messages from the user and whether or not it's a positive
    or negative message to train a classifier.
    """

    db = util.Database()
    c = db.getCursor()
    while True:
        s = raw_input("Enter sentence or \".\" to stop training: ")
        if s == ".":
            break

        tokens = repr(util.tokenize(s))
        l = raw_input(
            "Is this a positive (1), neutral(0) or negative(-1) sentence: "
            )
        label = int(l)
        c.execute("INSERT INTO training_data VALUES (?,?)", (tokens, label))

    db.close()
Example No. 48
def count_tokens(tokens, stops=None):
    stops = stops or []
    counts = defaultdict(int)
    for token in tokens:
        if token not in stops:
            counts[token] += 1
    return sorted((count, token) for token, count in counts.items())

def top_tokens(tokens):
    return [item for item in reversed(tokens[-10:])]

def unique_tokens(tokens):
    return [token for count, token in tokens if count == 1]

token_map = {}
for tweet in tweets():
    for token, tag in tag_tweets(tokenize(tweet['text'])):
        token_map.setdefault(tag, []).append(token)

for tag, tokens in token_map.items():
    print "% 10s % 8s % 6s" % (tag, len(tokens), len(set(tokens)))

print ""
print "EMOTICONS"
print "========="
counted = count_tokens(token_map['emoticon'])
print top_tokens(counted)
print len(unique_tokens(counted))
print ""

print "URLS"
print "===="
Example No. 49
 def testcorrect_token_contents(self):
     self.assertEquals( util.tokenize("  \t\n\rfred jim \tbob   \t"), ["fred", "jim", "bob"])
Example No. 50
 def testcorrect_num_tokens(self):
     self.assertEquals( len(util.tokenize("a b c d \t\n\rfred")), 5)
Example No. 51
 def testepsilon(self):
     self.assertEquals( util.tokenize(""), [])
Example No. 52
def make_concepts_exp(id, path, sents, query):
    """
    """
    
    query_words = set(util.porter_stem_sent(util.remove_stopwords(util.tokenize(fix_text(query)))).split())

    ## get sentence values
    sent_vals = prob_util.Counter()
    for sent in sents:
        query_overlap = set(util.remove_stopwords(sent.tok2.split())).intersection(query_words)
        sent_vals[sent] = max(0, len(query_overlap))
        #if sent.relevance < 0.3: sent_vals[sent] = 0.0
        #else: sent_vals[sent] = 100000**sent.relevance
        concepts = set(util.get_ngrams(sent.tok2, 2, bounds=False, as_string=True))
        sent.concepts = set()
        for concept in concepts:
            if util.is_just_stopwords(concept.split('_')): continue
            sent.concepts.add(concept)
    sent_vals = prob_util.normalize(sent_vals)

    ## get concept values
    concept_vals = prob_util.Counter()
    for sent in sents:
        for concept in sent.concepts:
            concept_vals[concept] += sent_vals[sent]            
    concept_vals = prob_util.normalize(concept_vals)
    
    iter = 0
    while True:
        iter += 1
        se = prob_util.entropy(sent_vals)
        ce = prob_util.entropy(concept_vals)
        print >>sys.stderr, 'iter [%d] sent entropy [%1.4f] concept entropy [%1.4f]' %(iter, se, ce)
        if iter >= 1: break
        
        ## get sent vals again
        sent_vals = prob_util.Counter()
        for sent in sents:
            for concept in sent.concepts:
                sent_vals[sent] += concept_vals[concept]
        sent_vals = prob_util.normalize(sent_vals)
        
        ## get concept values
        concept_vals = prob_util.Counter()
        for sent in sents:
            for concept in sent.concepts:
                concept_vals[concept] += sent_vals[sent]            
        concept_vals = prob_util.normalize(concept_vals)
    
    sorted_sents = sent_vals.sortedKeys()
    #for sent in sorted_sents:
    #    print sent_vals[sent], sent.order, sent.new_par, sent

    sorted_concepts = concept_vals.sortedKeys()
    #for concept in sorted_concepts:
    #    print concept_vals[concept], concept
        
    ## create final concept set
    final_concepts = {}
    for concept in sorted_concepts:
        val = concept_vals[concept]
        #if val < 0.00001: continue
        final_concepts[concept] = val
    final_concept_set = set(final_concepts.keys())

    ## get final sentence list and their concepts
    seen_sents = set()
    for sent in sents:
        skip = False
        if sent.length <= 5: skip = True
        if sent in seen_sents: skip = True
        if sent.order > 0: skip = True
        else: seen_sents.add(sent)
        if skip: sent.concepts = set()
        else: sent.concepts = sent.concepts.intersection(final_concept_set)        
        
    return create_ilp_output(sents, final_concepts, path+id)
Example No. 53
def make_concepts_compress2(id, path, sents, query, compressed_sents):
    """
    """
    
    query_words = set(util.porter_stem_sent(util.remove_stopwords(util.tokenize(fix_text(query)))).split())
    seen_sents = set()
    all_concepts = collections.defaultdict(set)
    ## different processing for set A and set B
    if '-B' in id: 
        first_weight = 2
        count_thresh = 4
        query_thresh = 0
    else: 
        first_weight = 1
        count_thresh = 3
        query_thresh = 1

    for sent in sents:
        
        ## store this sentence's concepts
        sent.concepts = set()
        concepts = set(util.get_ngrams(sent.tok2, 2, bounds=False, as_string=True))

        ## get query overlap
        query_overlap = set(util.remove_stopwords(sent.tok2.split())).intersection(query_words)

        ## aggregate all concepts
        if len(query_overlap) >= query_thresh:
            for concept in concepts:
                if sent.order == 0: all_concepts[concept].add('first' + sent.doc)
                else: all_concepts[concept].add(sent.doc)

        ## ignore some sents
        skip = False
        #if not sent.new_par: skip = True
        #if sent.length <= 20: skip = True
        if sent.tok in seen_sents: skip = True
        #if sent.ignore: skip = True
        if skip: continue
        
        seen_sents.add(sent.tok)
        sent.concepts = concepts

    ## create final concept set
    final_concepts = {}
    for concept, docs in all_concepts.items():
        count = len(docs)
        firsts = len([1 for d in docs if 'first' in d])
        count = count + (first_weight * firsts)
        if count < count_thresh: continue
        if util.is_just_stopwords(concept.split('_')): continue
        final_concepts[concept] = count
    final_concept_set = set(final_concepts.keys())

    for sent in sents:
        sent.concepts = sent.concepts.intersection(final_concept_set)
        
    for sent in compressed_sents:
        sent.concepts = set([])
        if sent.unresolved: continue
        if sent.length < 10: continue
        if re.match('^["(].*[")]$', sent.orig): skip = True
        concepts = set(util.get_ngrams(sent.tok2, 2, bounds=False, as_string=True))
        sent.concepts = concepts.intersection(final_concept_set)
        
    return create_ilp_output(compressed_sents, final_concepts, path+id)
Example No. 54
 def testonly_whitespace(self):
     self.assertEquals( util.tokenize("\t\n\r      "), [])
Example No. 55
File: poclbm.py  Project: bfx/poclbm
group.add_option('--vv',             dest='vectors',    default=[],          help='use vectors, default false')
group.add_option('-v', '--vectors',  dest='old_vectors',action='store_true', help='use vectors')
parser.add_option_group(group)

(options, options.servers) = parser.parse_args()

log.verbose = options.verbose
log.quiet = options.quiet

options.rate = if_else(options.verbose, max(options.rate, 60), max(options.rate, 0.1))

options.version = VERSION

options.max_update_time = 60

options.device = tokenize(options.device, 'device', [])

options.cutoff_temp = tokenize(options.cutoff_temp, 'cutoff_temp', [95], float)
options.cutoff_interval = tokenize(options.cutoff_interval, 'cutoff_interval', [0.01], float)

switch = None
try:
	switch = Switch(options)

	if not options.no_ocl:
		import OpenCLMiner
		for miner in OpenCLMiner.initialize(options):
			switch.add_miner(miner)

	if not options.no_bfl:
		import BFLMiner
Example No. 56
def filterComment(source, startLine, codeFile, commentFile, maxBucket):
    """ Find the comment at line i in the list source. When found check for 
    a multiline comment and get the corresponding code """

    comment = ""
    indentation = -1
    currIndent = -1
    code = []
    globalI = len(source) + 10

    # loop through all the lines in the source, get the comment 
    # and the corresponding code
    with open(commentFile, "a") as commentF:
        with open(codeFile, "a") as codeF:
            for i in xrange(startLine, len(source)):

                globalI = i
                line = source[i]

                # comments need to be directly above code
                if line.strip() == "" and comment == "":
                    return (i,False)

                # Continue if we have an divider row
                if line.replace("#", "").strip() == "" and line.strip() != "":
                    continue

                # check if it is an comment, and if so add it to the comment
                if line.strip()[:2] in commentList:
                    comment += line.strip().replace("#", "") + " "
                    continue

                # lines with docstrings are skipped
                if '"""' in line or "'''" in line:
                    return (i,False)

                # if we get here, it means we are not in the comment anymore
                # First get the indentation level of the current line of code
                currIndent = len(line) - len(line.lstrip())

                # If it is the first line of code, set our indentation level
                if indentation == -1:
                    indentation = currIndent

                # if we hit an empty line and have no code yet, return with an error 
                if line.strip() == "" and code == []:
                    return (i,False)

                # if we hit an empty line or go to an parent piece in the code
                # return the gathered code
                if line.strip() == "" or indentation > currIndent or (any(c in line for c in commentList)):
                    code = util.cleanCode(code)

                    # no need to save code-comment pairs larger than maxBucket size
                    if util.tokenize("".join(code)) < maxBucket[0] and util.tokenize(comment) < maxBucket[1] \
                    and not (any(exc in comment.lower() for exc in commentExceptions)):
                        # write to file
                        for j in xrange(len(code)):
                            codeF.write(code[j] + "\n")
                        codeF.write("!@#$%!@#$%!@#$%!@#$%!@#$%")
                        commentF.write(util.cleanComment(comment) + "\n!@#$%!@#$%!@#$%!@#$%!@#$%")

                        return (i,True)
                    else:
                        return (i,False)

                # add the line to our code if all is well (without any inline comments if any)
                if line.strip() != "":
                    code.append(line)

            code = util.cleanCode(code)

            # if we are here check if we have a comment / code not empty and smaller than maxBucket size
            if comment.strip() != "" and code != [] and \
            util.tokenize("".join(code)) < maxBucket[0] and util.tokenize(comment) < maxBucket[1] \
            and not (any(exc in comment.lower() for exc in commentExceptions)):
                # write to file
                for j in xrange(len(code)):
                    codeF.write(code[j] + "\n")
                codeF.write("!@#$%!@#$%!@#$%!@#$%!@#$%")
                commentF.write(util.cleanComment(comment) + "\n!@#$%!@#$%!@#$%!@#$%!@#$%")

                return (globalI+1,True)
            else:
                return (globalI+1,False)
Example No. 57
 def __iter__(self):
     for text in util.iter_corpus(self.dirname):
         # tokenize each work
         yield util.tokenize(text)
Example No. 58
def tokenize(src):
    for token_info in util.tokenize(src):
        token_class = TOKENS[token_info.type]
        yield token_class(*token_info[1:])
Example No. 59
def load_data(path, file_ext=['txt'], valid_split=None, vocab_file_name=None,
              max_vocab_size=None, max_len_w=None, output_path=None, subset_pct=100):
    """
    Given a path where data are saved, look for the ones with the right extensions
    If a split factor is given, it will split all the files into training and valid
    set. Then build vocabulary from the training and validation sets.

    Arguments:
        path: which directory to look for all the documents
        file_ext: what extension of the files to look for
        valid_split: to split the data into train/valid set. If None, no split
        vocab_file_name: optional file name. If None, the script will decide a name
                         given path and split
        max_vocab_size: maximum number of words to use in vocabulary (by most frequent)
        max_len_w: maximum length of sentences in words
        output_path: path used to save preprocessed data and results
        subset_pct: subset of dataset to load into H5 file (percentage)

    Returns:
        The function saves 2 files:
        h5 file with preprocessed data
        vocabulary file with: vocab, reverse_vocab, word_count
    """
    file_names = get_file_list(path, file_ext)

    file_str = get_file_str(path, len(file_names), labelled=False,
                            valid_split=valid_split, subset_pct=subset_pct)

    # create output dir if needed
    if not os.path.isdir(output_path):
        os.makedirs(output_path)

    # file name to store the vocabulary
    if vocab_file_name is None:
        vocab_file_name = file_str + '.vocab'
        vocab_file_name = os.path.join(output_path, vocab_file_name)

    # If max sizes aren't set, assume no limit
    if not max_len_w:
        max_len_w = sys.maxsize
    if not max_vocab_size:
        max_vocab_size = sys.maxsize

    # file name to store the pre-processed train/valid dataset
    h5_file_name = os.path.join(output_path, file_str + '.h5')

    if os.path.exists(h5_file_name) and os.path.exists(vocab_file_name):
        neon_logger.display("dataset files {} and vocabulary file {} already exist. "
                            "will use cached data. ".format(h5_file_name, vocab_file_name))
        return h5_file_name, vocab_file_name

    # split into training/valid set
    if valid_split is not None:
        if 'json' in file_ext:
            # Split based on number of files
            train_split = int(np.ceil(len(file_names) * (1 - valid_split)))
            train_files = file_names[:train_split]
            valid_files = file_names[train_split:]

            train_sent = load_json_sent(train_files, subset_pct)
            valid_sent = load_json_sent(valid_files, subset_pct)
            all_sent = train_sent + valid_sent
        elif 'txt' in file_ext:
            # Split based on number of lines (since only 2 files)
            all_sent = load_txt_sent(file_names, subset_pct)
            train_split = int(np.ceil(len(all_sent) * (1 - valid_split)))

            train_sent = all_sent[:train_split]
            valid_sent = all_sent[train_split:]
        else:
            neon_logger.display("Unsure how to load file_ext {}, please use 'json' or 'txt'."
                                .format(file_ext))
    else:
        train_files = file_names
        if 'json' in file_ext:
            train_sent = load_json_sent(train_files, subset_pct)
        elif 'txt' in file_ext:
            train_sent = load_txt_sent(train_files, subset_pct)
        else:
            neon_logger.display("Unsure how to load file_ext {}, please use 'json' or 'txt'."
                                .format(file_ext))
        all_sent = train_sent

    if os.path.exists(vocab_file_name):
        neon_logger.display("open existing vocab file: {}".format(vocab_file_name))
        vocab, rev_vocab, word_count = load_obj(vocab_file_name)
    else:
        neon_logger.display("Building  vocab file")

        # build vocab
        word_count = defaultdict(int)
        for sent in all_sent:
            sent_words = tokenize(sent)

            if len(sent_words) > max_len_w or len(sent_words) == 0:
                continue

            for word in sent_words:
                word_count[word] += 1

        # sort the word_count , re-assign ids by its frequency. Useful for downstream tasks
        # only done for train vocab
        vocab_sorted = sorted(word_count.items(), key=lambda kv: kv[1], reverse=True)

        vocab = OrderedDict()

        # get word count as array in same ordering as vocab (but with maximum length)
        word_count_ = np.zeros((len(word_count), ), dtype=np.int64)
        for i, t in enumerate(list(zip(*vocab_sorted))[0][:max_vocab_size]):
            word_count_[i] = word_count[t]
            vocab[t] = i
        word_count = word_count_

        # generate the reverse vocab
        rev_vocab = dict((wrd_id, wrd) for wrd, wrd_id in vocab.items())

        neon_logger.display("vocabulary from {} is saved into {}".format(path, vocab_file_name))
        save_obj((vocab, rev_vocab, word_count), vocab_file_name)

    vocab_size = len(vocab)
    neon_logger.display("\nVocab size from the dataset is: {}".format(vocab_size))

    neon_logger.display("\nProcessing and saving training data into {}".format(h5_file_name))

    # now process and save the train/valid data
    h5f = h5py.File(h5_file_name, 'w', libver='latest')
    shape, maxshape = (len(train_sent),), (None)
    dt = np.dtype([('text', h5py.special_dtype(vlen=str)),
                   ('num_words', np.uint16)])
    report_text_train = h5f.create_dataset('report_train', shape=shape,
                                           maxshape=maxshape, dtype=dt,
                                           compression='gzip')
    report_train = h5f.create_dataset('train', shape=shape, maxshape=maxshape,
                                      dtype=h5py.special_dtype(vlen=np.int32),
                                      compression='gzip')

    # map text to integers
    wdata = np.zeros((1, ), dtype=dt)
    ntrain = 0
    for sent in train_sent:
        text_int = [-1 if t not in vocab else vocab[t] for t in tokenize(sent)]

        # enforce maximum sentence length
        if len(text_int) > max_len_w or len(text_int) == 0:
            continue

        report_train[ntrain] = text_int

        wdata['text'] = clean_string(sent)
        wdata['num_words'] = len(text_int)
        report_text_train[ntrain] = wdata
        ntrain += 1

    report_train.attrs['nsample'] = ntrain
    report_train.attrs['vocab_size'] = vocab_size
    report_text_train.attrs['nsample'] = ntrain
    report_text_train.attrs['vocab_size'] = vocab_size

    if valid_split:
        neon_logger.display("\nProcessing and saving validation data into {}".format(h5_file_name))
        shape = (len(valid_sent),)
        report_text_valid = h5f.create_dataset('report_valid', shape=shape,
                                               maxshape=maxshape, dtype=dt,
                                               compression='gzip')
        report_valid = h5f.create_dataset('valid', shape=shape, maxshape=maxshape,
                                          dtype=h5py.special_dtype(vlen=np.int32),
                                          compression='gzip')
        nvalid = 0
        for sent in valid_sent:
            text_int = [-1 if t not in vocab else vocab[t] for t in tokenize(sent)]

            # enforce maximum sentence length
            if len(text_int) > max_len_w or len(text_int) == 0:
                continue

            report_valid[nvalid] = text_int
            wdata['text'] = clean_string(sent)
            wdata['num_words'] = len(text_int)
            report_text_valid[nvalid] = wdata
            nvalid += 1

        report_valid.attrs['nsample'] = nvalid
        report_valid.attrs['vocab_size'] = vocab_size
        report_text_valid.attrs['nsample'] = nvalid
        report_text_valid.attrs['vocab_size'] = vocab_size

    h5f.close()

    return h5_file_name, vocab_file_name
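To make the vocabulary-building branch easier to follow, here is its core in isolation: count words, sort by frequency, assign ids by rank, and keep the counts in the same order (toy sentences and whitespace splitting; not the project's pipeline):

import numpy as np
from collections import defaultdict, OrderedDict

sentences = ["the cat sat", "the dog sat on the mat"]
word_count = defaultdict(int)
for sent in sentences:
    for word in sent.split():
        word_count[word] += 1

# sort by frequency and re-assign ids by rank, as load_data does above
vocab_sorted = sorted(word_count.items(), key=lambda kv: kv[1], reverse=True)
vocab = OrderedDict((w, i) for i, (w, _) in enumerate(vocab_sorted))
counts = np.array([word_count[w] for w in vocab], dtype=np.int64)
print(list(vocab.items())[:2], counts[:2])  # [('the', 0), ('sat', 1)] [3 2]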